Diffstat (limited to 'llvm/test/CodeGen')
| -rw-r--r-- | llvm/test/CodeGen/AArch64/sbc.ll | 392 |
| -rw-r--r-- | llvm/test/CodeGen/AArch64/sve-extract-element.ll | 132 |
| -rw-r--r-- | llvm/test/CodeGen/PowerPC/milicode32.ll | 56 |
| -rw-r--r-- | llvm/test/CodeGen/PowerPC/milicode64.ll | 79 |
| -rw-r--r-- | llvm/test/CodeGen/RISCV/attributes.ll | 4 |
| -rw-r--r-- | llvm/test/CodeGen/RISCV/features-info.ll | 1 |
| -rw-r--r-- | llvm/test/CodeGen/RISCV/overflow-intrinsics.ll | 48 |
| -rw-r--r-- | llvm/test/CodeGen/SPIRV/non_int_constant_null.ll | 25 |
| -rw-r--r-- | llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll | 11 |
| -rw-r--r-- | llvm/test/CodeGen/SPIRV/zero-length-array.ll | 13 |
| -rw-r--r-- | llvm/test/CodeGen/WebAssembly/memory-interleave.ll | 278 |
| -rw-r--r-- | llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll | 20 |
| -rw-r--r-- | llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir | 2 |
| -rw-r--r-- | llvm/test/CodeGen/X86/ldexp-avx512.ll | 1288 |
| -rw-r--r-- | llvm/test/CodeGen/X86/pr166534.ll | 68 |
15 files changed, 2050 insertions, 367 deletions
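For orientation before the diffs: the largest new file, llvm/test/CodeGen/AArch64/sbc.ll, checks that a borrow computed from an unsigned compare folds into a flag-setting compare followed by SBC. A minimal sketch of the IR shape it exercises follows (the function name is illustrative, not part of the commit):

define i32 @borrow_sub_sketch(i32 %a, i32 %b, i32 %x, i32 %y) {
  %cc = icmp ult i32 %a, %b     ; borrow-in: does a - b underflow?
  %carry = zext i1 %cc to i32   ; materialize the borrow as 0 or 1
  %sub = sub i32 %x, %y
  %res = sub i32 %sub, %carry   ; expected to select as cmp + sbc on AArch64 (SelectionDAG)
  ret i32 %res
}

In the test below, the CHECK-SD lines expect exactly "cmp w0, w1" followed by "sbc w0, w2, w3" for this shape, while the GlobalISel (CHECK-GI) path still emits a separate cset plus sub.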
diff --git a/llvm/test/CodeGen/AArch64/sbc.ll b/llvm/test/CodeGen/AArch64/sbc.ll new file mode 100644 index 0000000..fff63c1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sbc.ll @@ -0,0 +1,392 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck --check-prefixes=CHECK,CHECK-SD %s +; RUN: llc < %s -global-isel | FileCheck --check-prefixes=CHECK,CHECK-GI %s + +target triple = "aarch64-none-linux-gnu" + +define i32 @test_basic_i32(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_basic_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_basic_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i64 @test_basic_i64(i64 %a, i64 %b, i64 %x, i64 %y) { +; CHECK-SD-LABEL: test_basic_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, x1 +; CHECK-SD-NEXT: sbc x0, x2, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_basic_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp x0, x1 +; CHECK-GI-NEXT: sub x9, x2, x3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub x0, x9, x8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i64 %a, %b + %carry = zext i1 %cc to i64 + %sub = sub i64 %x, %y + %res = sub i64 %sub, %carry + ret i64 %res +} + +define i64 @test_mixed_i32_i64(i32 %a, i32 %b, i64 %x, i64 %y) { +; CHECK-SD-LABEL: test_mixed_i32_i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc x0, x2, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_mixed_i32_i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub x9, x2, x3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub x0, x9, x8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i64 + %sub = sub i64 %x, %y + %res = sub i64 %sub, %carry + ret i64 %res +} + +define i32 @test_mixed_i64_i32(i64 %a, i64 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_mixed_i64_i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp x0, x1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_mixed_i64_i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp x0, x1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i64 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_only_borrow(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_only_borrow: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, wzr +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_only_borrow: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w2, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + ret i32 %res +} + +define i32 @test_sext_add(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_sext_add: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_sext_add: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sbfx w8, w8, #0, #1 +; 
CHECK-GI-NEXT: add w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = sext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = add i32 %sub, %carry + ret i32 %res +} + +; FIXME: This case could be supported with reversed operands to the CMP. +define i32 @test_ugt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_ugt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, hi +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_ugt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, hi +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ugt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_unsupported_cc_slt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_unsupported_cc_slt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, lt +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsupported_cc_slt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp slt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_unsupported_cc_sgt(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_unsupported_cc_sgt: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cset w9, gt +; CHECK-SD-NEXT: sub w0, w8, w9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_unsupported_cc_sgt: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cset w8, gt +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp sgt i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + ret i32 %res +} + +define i32 @test_multiple_setcc_uses(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_multiple_setcc_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: sub w19, w2, w0 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_setcc_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w19, w2 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: mov w0, w20 +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + tail call void @use(i1 %cc) + ret i32 %res +} + +define i32 @test_multiple_carry_uses(i32 %a, i32 %b, i32 %x) { +; CHECK-SD-LABEL: test_multiple_carry_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: cset w0, lo +; CHECK-SD-NEXT: sub w19, w2, w0 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_carry_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w19, w2 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: mov w0, w20 +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %res = sub i32 %x, %carry + tail call void @use(i32 %carry) + ret i32 %res +} + +define i32 @test_multiple_sub_uses(i32 %a, i32 %b, i32 %x, i32 %y) { +; CHECK-SD-LABEL: test_multiple_sub_uses: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: sub w8, w2, w3 +; CHECK-SD-NEXT: cmp w0, w1 +; CHECK-SD-NEXT: mov w0, w8 +; CHECK-SD-NEXT: sbc w19, w2, w3 +; CHECK-SD-NEXT: bl use +; CHECK-SD-NEXT: mov w0, w19 +; CHECK-SD-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_multiple_sub_uses: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 32 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: sub w19, w2, w3 +; CHECK-GI-NEXT: cmp w0, w1 +; CHECK-GI-NEXT: mov w0, w19 +; CHECK-GI-NEXT: cset w20, lo +; CHECK-GI-NEXT: bl use +; CHECK-GI-NEXT: sub w0, w19, w20 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-GI-NEXT: ret + %cc = icmp ult i32 %a, %b + %carry = zext i1 %cc to i32 + %sub = sub i32 %x, %y + %res = sub i32 %sub, %carry + tail call void @use(i32 %sub) + ret i32 %res +} + +define i8 @test_i8(i8 %a, i8 %b, i8 %x, i8 %y) { +; CHECK-SD-LABEL: test_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: cmp w8, w1, uxtb +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xff +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cmp w8, w1, uxtb +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i8 %a, %b + %carry = zext i1 %cc to i8 + %sub = sub i8 %x, %y + %res = sub i8 %sub, %carry + ret i8 %res +} + +define i16 @test_i16(i16 %a, i16 %b, i16 %x, i16 %y) { +; CHECK-SD-LABEL: test_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: cmp w8, w1, uxth +; CHECK-SD-NEXT: sbc w0, w2, w3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w0, #0xffff +; CHECK-GI-NEXT: sub w9, w2, w3 +; CHECK-GI-NEXT: cmp w8, w1, uxth +; CHECK-GI-NEXT: cset w8, lo +; CHECK-GI-NEXT: sub w0, w9, w8 +; CHECK-GI-NEXT: ret + %cc = icmp ult i16 %a, %b + %carry = zext i1 %cc to i16 + %sub = sub i16 %x, %y + %res = sub i16 %sub, %carry + ret i16 %res +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %x, <4 x i32> %y) { +; CHECK-SD-LABEL: test_v4i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub v2.4s, v2.4s, v3.4s +; CHECK-SD-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_v4i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v4.4s, #1 +; CHECK-GI-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sub v1.4s, v2.4s, v3.4s +; CHECK-GI-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-GI-NEXT: sub v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: ret + %cc = icmp ult <4 x i32> %a, %b + %carry = zext <4 x i1> %cc to <4 x i32> + %sub = sub <4 x i32> %x, %y + %res = sub <4 x i32> %sub, %carry + ret <4 x i32> %res +} + +declare void @use() +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll index c340df1..0cc2e04 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll @@ -12,6 +12,26 @@ define i8 @test_lane0_16xi8(<vscale x 16 x i8> %a) #0 { ret i8 %b } +define i32 @test_lane0_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane0_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 0 + %c = zext i8 %b to i32 + ret i32 %c +} + +define i64 @test_lane0_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane0_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 0 + %c = zext i8 %b to i64 + ret i64 %c +} + define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 { ; CHECK-LABEL: test_lane15_16xi8: ; CHECK: // %bb.0: @@ -21,6 +41,26 @@ define i8 @test_lane15_16xi8(<vscale x 16 x i8> %a) #0 { ret i8 %b } +define i32 @test_lane15_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane15_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[15] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 15 + %c = zext i8 %b to i32 + ret i32 %c +} + +define i64 @test_lane15_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane15_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.b[15] +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 15 + %c = zext i8 %b to i64 + ret i64 %c +} + define i8 @test_lane16_16xi8(<vscale x 16 x i8> %a) #0 { ; CHECK-LABEL: test_lane16_16xi8: ; CHECK: // %bb.0: @@ -31,6 +71,32 @@ define i8 @test_lane16_16xi8(<vscale x 16 x i8> %a) #0 { ret i8 %b } +; FIXME: FMOV+AND -> UMOV. +define i32 @test_lane16_16xi8_zext_i32(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane16_16xi8_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.b, z0.b[16] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 16 + %c = zext i8 %b to i32 + ret i32 %c +} + +; FIXME: FMOV+AND -> UMOV. 
+define i64 @test_lane16_16xi8_zext_i64(<vscale x 16 x i8> %a) #0 { +; CHECK-LABEL: test_lane16_16xi8_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.b, z0.b[16] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and x0, x8, #0xff +; CHECK-NEXT: ret + %b = extractelement <vscale x 16 x i8> %a, i32 16 + %c = zext i8 %b to i64 + ret i64 %c +} + define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 { ; CHECK-LABEL: test_lane0_8xi16: ; CHECK: // %bb.0: @@ -40,6 +106,26 @@ define i16 @test_lane0_8xi16(<vscale x 8 x i16> %a) #0 { ret i16 %b } +define i32 @test_lane0_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane0_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 0 + %c = zext i16 %b to i32 + ret i32 %c +} + +define i64 @test_lane0_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane0_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 0 + %c = zext i16 %b to i64 + ret i64 %c +} + define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 { ; CHECK-LABEL: test_lane7_8xi16: ; CHECK: // %bb.0: @@ -49,6 +135,26 @@ define i16 @test_lane7_8xi16(<vscale x 8 x i16> %a) #0 { ret i16 %b } +define i32 @test_lane7_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane7_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[7] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 7 + %c = zext i16 %b to i32 + ret i32 %c +} + +define i64 @test_lane7_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane7_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: umov w0, v0.h[7] +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 7 + %c = zext i16 %b to i64 + ret i64 %c +} + define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 { ; CHECK-LABEL: test_lane8_8xi16: ; CHECK: // %bb.0: @@ -59,6 +165,32 @@ define i16 @test_lane8_8xi16(<vscale x 8 x i16> %a) #0 { ret i16 %b } +; FIXME: FMOV+AND -> UMOV. +define i32 @test_lane8_8xi16_zext_i32(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane8_8xi16_zext_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w0, w8, #0xffff +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 8 + %c = zext i16 %b to i32 + ret i32 %c +} + +; FIXME: FMOV+AND -> UMOV. 
+define i64 @test_lane8_8xi16_zext_i64(<vscale x 8 x i16> %a) #0 { +; CHECK-LABEL: test_lane8_8xi16_zext_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[8] +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and x0, x8, #0xffff +; CHECK-NEXT: ret + %b = extractelement <vscale x 8 x i16> %a, i32 8 + %c = zext i16 %b to i64 + ret i64 %c +} + define i32 @test_lane0_4xi32(<vscale x 4 x i32> %a) #0 { ; CHECK-LABEL: test_lane0_4xi32: ; CHECK: // %bb.0: diff --git a/llvm/test/CodeGen/PowerPC/milicode32.ll b/llvm/test/CodeGen/PowerPC/milicode32.ll index 78d0362..ddadd01 100644 --- a/llvm/test/CodeGen/PowerPC/milicode32.ll +++ b/llvm/test/CodeGen/PowerPC/milicode32.ll @@ -69,3 +69,59 @@ entry: } declare i32 @strlen(ptr noundef) nounwind + +define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i32 noundef %num) #0 { +; CHECK-AIX-32-P9-LABEL: test_memmove: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: mflr r0 +; CHECK-AIX-32-P9-NEXT: stwu r1, -80(r1) +; CHECK-AIX-32-P9-NEXT: stw r0, 88(r1) +; CHECK-AIX-32-P9-NEXT: stw r31, 76(r1) # 4-byte Folded Spill +; CHECK-AIX-32-P9-NEXT: mr r31, r3 +; CHECK-AIX-32-P9-NEXT: stw r3, 72(r1) +; CHECK-AIX-32-P9-NEXT: stw r4, 68(r1) +; CHECK-AIX-32-P9-NEXT: stw r5, 64(r1) +; CHECK-AIX-32-P9-NEXT: bl .___memmove[PR] +; CHECK-AIX-32-P9-NEXT: nop +; CHECK-AIX-32-P9-NEXT: mr r3, r31 +; CHECK-AIX-32-P9-NEXT: lwz r31, 76(r1) # 4-byte Folded Reload +; CHECK-AIX-32-P9-NEXT: addi r1, r1, 80 +; CHECK-AIX-32-P9-NEXT: lwz r0, 8(r1) +; CHECK-AIX-32-P9-NEXT: mtlr r0 +; CHECK-AIX-32-P9-NEXT: blr +; +; CHECK-LINUX32-P9-LABEL: test_memmove: +; CHECK-LINUX32-P9: # %bb.0: # %entry +; CHECK-LINUX32-P9-NEXT: mflr r0 +; CHECK-LINUX32-P9-NEXT: stwu r1, -32(r1) +; CHECK-LINUX32-P9-NEXT: stw r0, 36(r1) +; CHECK-LINUX32-P9-NEXT: .cfi_def_cfa_offset 32 +; CHECK-LINUX32-P9-NEXT: .cfi_offset lr, 4 +; CHECK-LINUX32-P9-NEXT: .cfi_offset r30, -8 +; CHECK-LINUX32-P9-NEXT: stw r30, 24(r1) # 4-byte Folded Spill +; CHECK-LINUX32-P9-NEXT: mr r30, r3 +; CHECK-LINUX32-P9-NEXT: stw r3, 20(r1) +; CHECK-LINUX32-P9-NEXT: stw r4, 16(r1) +; CHECK-LINUX32-P9-NEXT: stw r5, 12(r1) +; CHECK-LINUX32-P9-NEXT: bl memmove +; CHECK-LINUX32-P9-NEXT: mr r3, r30 +; CHECK-LINUX32-P9-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload +; CHECK-LINUX32-P9-NEXT: lwz r0, 36(r1) +; CHECK-LINUX32-P9-NEXT: addi r1, r1, 32 +; CHECK-LINUX32-P9-NEXT: mtlr r0 +; CHECK-LINUX32-P9-NEXT: blr +entry: + %destination.addr = alloca ptr, align 4 + %source.addr = alloca ptr, align 4 + %num.addr = alloca i32, align 4 + store ptr %destination, ptr %destination.addr, align 4 + store ptr %source, ptr %source.addr, align 4 + store i32 %num, ptr %num.addr, align 4 + %0 = load ptr, ptr %destination.addr, align 4 + %1 = load ptr, ptr %source.addr, align 4 + %2 = load i32, ptr %num.addr, align 4 + call void @llvm.memmove.p0.p0.i32(ptr align 1 %0, ptr align 1 %1, i32 %2, i1 false) + ret ptr %0 +} + +declare void @llvm.memmove.p0.p0.i32(ptr writeonly captures(none), ptr readonly captures(none), i32, i1 immarg) diff --git a/llvm/test/CodeGen/PowerPC/milicode64.ll b/llvm/test/CodeGen/PowerPC/milicode64.ll index 8b87529..f7814a4 100644 --- a/llvm/test/CodeGen/PowerPC/milicode64.ll +++ b/llvm/test/CodeGen/PowerPC/milicode64.ll @@ -100,3 +100,82 @@ entry: } declare i64 @strlen(ptr noundef) nounwind + +define ptr @test_memmove(ptr noundef %destination, ptr noundef %source, i64 noundef %num) #0 { +; CHECK-LE-P9-LABEL: test_memmove: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mflr r0 +; CHECK-LE-P9-NEXT: .cfi_def_cfa_offset 80 
+; CHECK-LE-P9-NEXT: .cfi_offset lr, 16 +; CHECK-LE-P9-NEXT: .cfi_offset r30, -16 +; CHECK-LE-P9-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-LE-P9-NEXT: stdu r1, -80(r1) +; CHECK-LE-P9-NEXT: std r0, 96(r1) +; CHECK-LE-P9-NEXT: mr r30, r3 +; CHECK-LE-P9-NEXT: std r3, 56(r1) +; CHECK-LE-P9-NEXT: std r4, 48(r1) +; CHECK-LE-P9-NEXT: std r5, 40(r1) +; CHECK-LE-P9-NEXT: bl memmove +; CHECK-LE-P9-NEXT: nop +; CHECK-LE-P9-NEXT: mr r3, r30 +; CHECK-LE-P9-NEXT: addi r1, r1, 80 +; CHECK-LE-P9-NEXT: ld r0, 16(r1) +; CHECK-LE-P9-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-LE-P9-NEXT: mtlr r0 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_memmove: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: mflr r0 +; CHECK-BE-P9-NEXT: stdu r1, -160(r1) +; CHECK-BE-P9-NEXT: std r0, 176(r1) +; CHECK-BE-P9-NEXT: .cfi_def_cfa_offset 160 +; CHECK-BE-P9-NEXT: .cfi_offset lr, 16 +; CHECK-BE-P9-NEXT: .cfi_offset r30, -16 +; CHECK-BE-P9-NEXT: std r30, 144(r1) # 8-byte Folded Spill +; CHECK-BE-P9-NEXT: mr r30, r3 +; CHECK-BE-P9-NEXT: std r3, 136(r1) +; CHECK-BE-P9-NEXT: std r4, 128(r1) +; CHECK-BE-P9-NEXT: std r5, 120(r1) +; CHECK-BE-P9-NEXT: bl memmove +; CHECK-BE-P9-NEXT: nop +; CHECK-BE-P9-NEXT: mr r3, r30 +; CHECK-BE-P9-NEXT: ld r30, 144(r1) # 8-byte Folded Reload +; CHECK-BE-P9-NEXT: addi r1, r1, 160 +; CHECK-BE-P9-NEXT: ld r0, 16(r1) +; CHECK-BE-P9-NEXT: mtlr r0 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_memmove: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: mflr r0 +; CHECK-AIX-64-P9-NEXT: stdu r1, -144(r1) +; CHECK-AIX-64-P9-NEXT: std r0, 160(r1) +; CHECK-AIX-64-P9-NEXT: std r31, 136(r1) # 8-byte Folded Spill +; CHECK-AIX-64-P9-NEXT: mr r31, r3 +; CHECK-AIX-64-P9-NEXT: std r3, 128(r1) +; CHECK-AIX-64-P9-NEXT: std r4, 120(r1) +; CHECK-AIX-64-P9-NEXT: std r5, 112(r1) +; CHECK-AIX-64-P9-NEXT: bl .memmove[PR] +; CHECK-AIX-64-P9-NEXT: nop +; CHECK-AIX-64-P9-NEXT: mr r3, r31 +; CHECK-AIX-64-P9-NEXT: ld r31, 136(r1) # 8-byte Folded Reload +; CHECK-AIX-64-P9-NEXT: addi r1, r1, 144 +; CHECK-AIX-64-P9-NEXT: ld r0, 16(r1) +; CHECK-AIX-64-P9-NEXT: mtlr r0 +; CHECK-AIX-64-P9-NEXT: blr +entry: + %destination.addr = alloca ptr, align 8 + %source.addr = alloca ptr, align 8 + %num.addr = alloca i64, align 8 + store ptr %destination, ptr %destination.addr, align 8 + store ptr %source, ptr %source.addr, align 8 + store i64 %num, ptr %num.addr, align 8 + %0 = load ptr, ptr %destination.addr, align 8 + %1 = load ptr, ptr %source.addr, align 8 + %2 = load i64, ptr %num.addr, align 8 + call void @llvm.memmove.p0.p0.i64(ptr align 1 %0, ptr align 1 %1, i64 %2, i1 false) + ret ptr %0 +} + +declare void @llvm.memmove.p0.p0.i32(ptr writeonly captures(none), ptr readonly captures(none), i32, i1 immarg) diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 22c2d81..f26d4f0 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -125,6 +125,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCDELEG %s ; RUN: llc -mtriple=riscv32 -mattr=+smcntrpmf %s -o - | FileCheck --check-prefixes=CHECK,RV32SMCNTRPMF %s ; RUN: llc -mtriple=riscv32 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV32SMEPMP %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-smpmpmt %s -o - | FileCheck --check-prefixes=CHECK,RV32SMPMPMT %s ; RUN: llc -mtriple=riscv32 -mattr=+smrnmi %s -o - | FileCheck --check-prefixes=CHECK,RV32SMRNMI %s ; RUN: llc -mtriple=riscv32 
-mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfa %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFA %s @@ -275,6 +276,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+smcdeleg %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCDELEG %s ; RUN: llc -mtriple=riscv64 -mattr=+smcntrpmf %s -o - | FileCheck --check-prefixes=CHECK,RV64SMCNTRPMF %s ; RUN: llc -mtriple=riscv64 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV64SMEPMP %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-smpmpmt %s -o - | FileCheck --check-prefixes=CHECK,RV64SMPMPMT %s ; RUN: llc -mtriple=riscv64 -mattr=+smrnmi %s -o - | FileCheck --check-prefixes=CHECK,RV64SMRNMI %s ; RUN: llc -mtriple=riscv64 -mattr=+zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfa %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFA %s @@ -439,6 +441,7 @@ ; RV32SMCDELEG: .attribute 5, "rv32i2p1_smcdeleg1p0" ; RV32SMCNTRPMF: .attribute 5, "rv32i2p1_smcntrpmf1p0" ; RV32SMEPMP: .attribute 5, "rv32i2p1_smepmp1p0" +; RV32SMPMPMT: .attribute 5, "rv32i2p1_smpmpmt0p6" ; RV32SMRNMI: .attribute 5, "rv32i2p1_smrnmi1p0" ; RV32ZFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0" ; RV32ZVFBFA: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfa0p1_zvl32b1p0" @@ -587,6 +590,7 @@ ; RV64SMCDELEG: .attribute 5, "rv64i2p1_smcdeleg1p0" ; RV64SMCNTRPMF: .attribute 5, "rv64i2p1_smcntrpmf1p0" ; RV64SMEPMP: .attribute 5, "rv64i2p1_smepmp1p0" +; RV64SMPMPMT: .attribute 5, "rv64i2p1_smpmpmt0p6" ; RV64SMRNMI: .attribute 5, "rv64i2p1_smrnmi1p0" ; RV64ZFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0" ; RV64ZVFBFA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfa0p1_zvl32b1p0" diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index cf44af6..3d9906f 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -27,6 +27,7 @@ ; CHECK-NEXT: experimental - Experimental intrinsics. ; CHECK-NEXT: experimental-p - 'P' ('Base P' (Packed SIMD)). ; CHECK-NEXT: experimental-rvm23u32 - RISC-V experimental-rvm23u32 profile. +; CHECK-NEXT: experimental-smpmpmt - 'Smpmpmt' (PMP-based Memory Types Extension). ; CHECK-NEXT: experimental-svukte - 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses). ; CHECK-NEXT: experimental-xqccmp - 'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves). ; CHECK-NEXT: experimental-xqcia - 'Xqcia' (Qualcomm uC Arithmetic Extension). diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index ba6769b..0306bb1 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -232,7 +232,7 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp { ret i64 %Q } -; TODO? CGP sinks the compare before we have a chance to form the overflow intrinsic. +; Ensure CGP doesn't sink the compare before we have a chance to form the overflow intrinsic. 
define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp { ; RV32-LABEL: uaddo4: @@ -1076,41 +1076,37 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: .cfi_offset s4, -24 ; RV32-NEXT: .cfi_offset s5, -28 ; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: mv s5, a5 -; RV32-NEXT: mv s3, a1 +; RV32-NEXT: mv s1, a5 +; RV32-NEXT: mv s4, a1 ; RV32-NEXT: andi a1, a5, 1 -; RV32-NEXT: beqz a1, .LBB32_8 +; RV32-NEXT: beqz a1, .LBB32_6 ; RV32-NEXT: # %bb.1: # %t ; RV32-NEXT: mv s0, a4 -; RV32-NEXT: mv s2, a3 -; RV32-NEXT: mv s1, a2 -; RV32-NEXT: mv s4, a0 -; RV32-NEXT: beq s3, a3, .LBB32_3 +; RV32-NEXT: mv s3, a3 +; RV32-NEXT: mv s2, a2 +; RV32-NEXT: mv s5, a0 +; RV32-NEXT: beq s4, a3, .LBB32_3 ; RV32-NEXT: # %bb.2: # %t -; RV32-NEXT: sltu s6, s3, s2 +; RV32-NEXT: sltu s6, s4, s3 ; RV32-NEXT: j .LBB32_4 ; RV32-NEXT: .LBB32_3: -; RV32-NEXT: sltu s6, s4, s1 +; RV32-NEXT: sltu s6, s5, s2 ; RV32-NEXT: .LBB32_4: # %t ; RV32-NEXT: mv a0, s6 ; RV32-NEXT: call call -; RV32-NEXT: beqz s6, .LBB32_8 +; RV32-NEXT: beqz s6, .LBB32_6 ; RV32-NEXT: # %bb.5: # %end -; RV32-NEXT: sltu a1, s4, s1 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: beq s3, s2, .LBB32_7 -; RV32-NEXT: # %bb.6: # %end -; RV32-NEXT: sltu a0, s3, s2 -; RV32-NEXT: .LBB32_7: # %end -; RV32-NEXT: sub a2, s3, s2 -; RV32-NEXT: sub a3, s4, s1 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: sw a3, 0(s0) -; RV32-NEXT: sw a2, 4(s0) -; RV32-NEXT: j .LBB32_9 -; RV32-NEXT: .LBB32_8: # %f -; RV32-NEXT: mv a0, s5 -; RV32-NEXT: .LBB32_9: # %f +; RV32-NEXT: sltu a0, s5, s2 +; RV32-NEXT: sub a1, s4, s3 +; RV32-NEXT: sub a2, s5, s2 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: sw a2, 0(s0) +; RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: mv a0, s6 +; RV32-NEXT: j .LBB32_7 +; RV32-NEXT: .LBB32_6: # %f +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: .LBB32_7: # %f ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll b/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll new file mode 100644 index 0000000..0ba016a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/non_int_constant_null.ll @@ -0,0 +1,25 @@ +; RUN: llc -mtriple spirv64-unknown-unknown %s --spirv-ext=+SPV_KHR_float_controls2 -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -mtriple spirv64-unknown-unknown %s --spirv-ext=+SPV_KHR_float_controls2 -o - -filetype=obj | spirv-val %} + +@A = addrspace(1) constant [1 x i8] zeroinitializer + +; CHECK: OpName %[[#FOO:]] "foo" +; CHECK: OpName %[[#A:]] "A" +; CHECK: OpDecorate %[[#A]] Constant +; CHECK: OpDecorate %[[#A]] LinkageAttributes "A" Export +; CHECK: %[[#INT8:]] = OpTypeInt 8 0 +; CHECK: %[[#INT32:]] = OpTypeInt 32 0 +; CHECK: %[[#ONE:]] = OpConstant %[[#INT32]] 1 +; CHECK: %[[#ARR_INT8:]] = OpTypeArray %[[#INT8]] %7 +; CHECK: %[[#ARR_INT8_PTR:]] = OpTypePointer CrossWorkgroup %[[#ARR_INT8]] +; CHECK: %[[#ARR_INT8_ZERO:]] = OpConstantNull %[[#ARR_INT8]] +; CHECK: %13 = OpVariable %[[#ARR_INT8_PTR]] CrossWorkgroup %[[#ARR_INT8_ZERO]] +; CHECK: %[[#FOO]] = OpFunction +; CHECK: = OpLabel +; CHECK: OpReturn +; CHECK: OpFunctionEnd + +define spir_kernel void @foo() { +entry: + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll new file mode 100644 index 0000000..1d3ba2a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/unpackhalf2x16-error.ll @@ -0,0 +1,11 @@ +; RUN: not llc -verify-machineinstrs -O0 
-mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: %5:vfid(<2 x s64>) = nnan ninf nsz arcp afn reassoc G_INTRINSIC intrinsic(@llvm.spv.unpackhalf2x16), %0:iid(s64) is only supported with the GLSL extended instruction set. + +define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 { + %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.spv.unpackhalf2x16.v2f32(i32 %0) + %3 = extractelement <2 x float> %2, i64 0 + ret float %3 +} + diff --git a/llvm/test/CodeGen/SPIRV/zero-length-array.ll b/llvm/test/CodeGen/SPIRV/zero-length-array.ll index 666176c..5fd94d2 100644 --- a/llvm/test/CodeGen/SPIRV/zero-length-array.ll +++ b/llvm/test/CodeGen/SPIRV/zero-length-array.ll @@ -1,10 +1,17 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - -filetype=obj | spirv-val %} -; CHECK: %[[#type:]] = OpTypeInt 32 0 -; CHECK: %[[#ext:]] = OpConstant %[[#type]] 0 +; Nothing is generated, but compilation doesn't crash. +; CHECK: OpName %[[#FOO:]] "foo" +; CHECK: OpName %[[#RTM:]] "reg2mem alloca point" +; CHECK: %[[#INT:]] = OpTypeInt 32 0 +; CHECK: %[[#RTM]] = OpConstant %[[#INT]] 0 +; CHECK: %[[#FOO]] = OpFunction +; CHECK-NEXT: = OpLabel +; CHECK-NEXT: OpReturn +; CHECK-NEXT: OpFunctionEnd -define spir_func void @_Z3foov() { +define spir_func void @foo() { entry: %i = alloca [0 x i32], align 4 ret void diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll index 404db23..5d58ae2 100644 --- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll +++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll @@ -1720,28 +1720,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: two_floats_two_bytes_same_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 255, 255, 255, 255 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: v128.store64_lane +; CHECK-NOT: v128.load define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp22.not = icmp eq i32 %N, 0 @@ -1774,28 +1753,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: two_floats_two_bytes_vary_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 -; CHECK: f32x4.add -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 255, 
255, 255, 255 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 -; CHECK: f32x4.sub -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: v128.store64_lane +; CHECK-NOT: v128.load define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp21.not = icmp eq i32 %N, 0 @@ -2347,64 +2305,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_bytes_same_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 255, 255, 255, 255 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; 
CHECK: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31 -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -2453,64 +2354,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_bytes_vary_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 255, 255, 255, 255 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.add -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.div -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.sub -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.narrow_i16x8_u -; CHECK: i8x16.shuffle 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31 -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, 
ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 @@ -2757,62 +2601,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_shorts_same_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 65535, 65535, 65535, 65535 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31 -; CHECK: v128.store -; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27 -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp48.not = icmp eq i32 %N, 0 @@ -2861,62 +2650,7 @@ for.body: ; preds = %entry, %for.body } ; CHECK-LABEL: four_floats_four_shorts_vary_op: -; CHECK: loop -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 
0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: v128.load -; CHECK: v128.load -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.mul -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.const 65535, 65535, 65535, 65535 -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.add -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.div -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31 -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 -; CHECK: f32x4.sub -; CHECK: i32x4.trunc_sat_f32x4_s -; CHECK: v128.and -; CHECK: i16x8.narrow_i32x4_u -; CHECK: i8x16.shuffle 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 -; CHECK: i8x16.shuffle 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31 -; CHECK: v128.store -; CHECK: i8x16.shuffle 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27 -; CHECK: v128.store +; CHECK-NOT: v128.load define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) { entry: %cmp45.not = icmp eq i32 %N, 0 diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll index 1962dde..f2b4c49 100644 --- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll +++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll @@ -36,10 +36,10 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: Ltmp0: +; CHECK-NEXT: Ltmp0: ## EH_LABEL ; CHECK-NEXT: ## implicit-def: $ebx 
; CHECK-NEXT: calll __Znam -; CHECK-NEXT: Ltmp1: +; CHECK-NEXT: Ltmp1: ## EH_LABEL ; CHECK-NEXT: ## %bb.1: ## %bb11 ; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: movb $1, %al @@ -58,13 +58,13 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: jne LBB0_9 ; CHECK-NEXT: ## %bb.10: ## %bb41 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 -; CHECK-NEXT: Ltmp2: +; CHECK-NEXT: Ltmp2: ## EH_LABEL ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %esi, (%esp) ; CHECK-NEXT: calll _Pjii -; CHECK-NEXT: Ltmp3: +; CHECK-NEXT: Ltmp3: ## EH_LABEL ; CHECK-NEXT: ## %bb.11: ## %bb42 ; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: xorl %eax, %eax @@ -126,20 +126,20 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill ; CHECK-NEXT: jmp LBB0_8 ; CHECK-NEXT: LBB0_18: ## %bb43 -; CHECK-NEXT: Ltmp5: +; CHECK-NEXT: Ltmp5: ## EH_LABEL ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: calll _OnOverFlow -; CHECK-NEXT: Ltmp6: +; CHECK-NEXT: Ltmp6: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_3 ; CHECK-NEXT: LBB0_2: ## %bb29 -; CHECK-NEXT: Ltmp7: +; CHECK-NEXT: Ltmp7: ## EH_LABEL ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: calll _OnOverFlow -; CHECK-NEXT: Ltmp8: +; CHECK-NEXT: Ltmp8: ## EH_LABEL ; CHECK-NEXT: LBB0_3: ## %bb30 ; CHECK-NEXT: ud2 ; CHECK-NEXT: LBB0_4: ## %bb20.loopexit -; CHECK-NEXT: Ltmp4: +; CHECK-NEXT: Ltmp4: ## EH_LABEL ; CHECK-NEXT: LBB0_9: ; CHECK-NEXT: movl %esi, %ebx ; CHECK-NEXT: LBB0_6: ## %bb23 @@ -151,7 +151,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl ; CHECK-NEXT: LBB0_5: ## %bb20.loopexit.split-lp -; CHECK-NEXT: Ltmp9: +; CHECK-NEXT: Ltmp9: ## EH_LABEL ; CHECK-NEXT: jmp LBB0_6 ; CHECK-NEXT: Lfunc_end0: bb: diff --git a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir index 348a290..2445306 100644 --- a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir +++ b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir @@ -55,7 +55,7 @@ !9 = !DILocalVariable(name: "4", scope: !5, file: !1, line: 4, type: !10) !10 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned) !11 = !DILocation(line: 4, column: 1, scope: !5) - !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) + !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !7) ... 
--- diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll index ea93a91..21491bc 100644 --- a/llvm/test/CodeGen/X86/ldexp-avx512.ll +++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll @@ -47,6 +47,187 @@ entry: } declare fp128 @ldexpl(fp128, i32) memory(none) +define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_8xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $88, %rsp +; AVX512-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm2, %xmm0 +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi 
+; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512-NEXT: addq $88, %rsp +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_8xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $88, %rsp +; AVX512VL-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm2, %xmm0 +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: 
vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX512VL-NEXT: addq $88, %rsp +; AVX512VL-NEXT: retq + %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> %x, <8 x i16> %exp) + ret <8 x half> %r +} +declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>) + define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_4xfloat: ; CHECK: # %bb.0: @@ -109,6 +290,381 @@ define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwi } declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>) +define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_16xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $168, %rsp +; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: 
vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; 
AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX512-NEXT: addq $168, %rsp +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_16xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $168, %rsp +; AVX512VL-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq 
ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX512VL-NEXT: vmovd %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX512VL-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] +; AVX512VL-NEXT: addq $168, %rsp +; AVX512VL-NEXT: retq + %r = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> %x, <16 x i16> %exp) + ret <16 x half> %r +} +declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>) + define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind { ; CHECK-LABEL: test_ldexp_8xfloat: ; CHECK: # %bb.0: @@ -230,6 +786,735 @@ define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwi } declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>) +define <32 x half> @test_ldexp_32xhalf(<32 x half> %x, <32 x i16> %exp) nounwind { +; AVX512-LABEL: test_ldexp_32xhalf: +; AVX512: # %bb.0: +; AVX512-NEXT: subq $360, %rsp # imm = 0x168 +; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512-NEXT: 
vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vextracti32x4 $2, %zmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: 
vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $7, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $6, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $5, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $4, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, 
%eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $3, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), 
%zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $2, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vpextrw $1, %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512-NEXT: vmovd %xmm1, %eax +; AVX512-NEXT: movswl %ax, %edi +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: callq ldexpf@PLT +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload +; AVX512-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; AVX512-NEXT: addq $360, %rsp # imm = 0x168 +; AVX512-NEXT: retq +; +; AVX512VL-LABEL: test_ldexp_32xhalf: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: subq $360, %rsp # imm = 0x168 +; AVX512VL-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 
16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vextracti32x4 $2, %zmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq 
ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX512VL-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $6, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512VL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $5, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,0] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; AVX512VL-NEXT: vpextrw $4, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload +; AVX512VL-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: callq ldexpf@PLT +; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax +; AVX512VL-NEXT: movswl %ax, %edi +; AVX512VL-NEXT: callq 
ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrlq $48, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vcvtph2ps (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vpextrw $3, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = mem[1,1,3,3]
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vpextrw $2, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX512VL-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vpextrw $1, %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512VL-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: movswl %ax, %edi
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: callq ldexpf@PLT
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512VL-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512VL-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX512VL-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512VL-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
+; AVX512VL-NEXT: # zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; AVX512VL-NEXT: addq $360, %rsp # imm = 0x168
+; AVX512VL-NEXT: retq
+ %r = call <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half> %x, <32 x i16> %exp)
+ ret <32 x half> %r
+}
+declare <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half>, <32 x i16>)
+
 define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) nounwind {
 ; CHECK-LABEL: test_ldexp_16xfloat:
 ; CHECK: # %bb.0:
@@ -462,6 +1747,3 @@ define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwi
 }
 declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX512: {{.*}}
-; AVX512VL: {{.*}}
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
index aef44cc..162a0c9 100644
--- a/llvm/test/CodeGen/X86/pr166534.ll
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -7,100 +7,64 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ; SSE2-LABEL: pr166534:
 ; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq (%rdi), %rax
-; SSE2-NEXT: movq 8(%rdi), %r8
 ; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movq (%rsi), %r9
-; SSE2-NEXT: movq 8(%rsi), %rdi
 ; SSE2-NEXT: movdqu (%rsi), %xmm1
 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
 ; SSE2-NEXT: pmovmskb %xmm1, %esi
-; SSE2-NEXT: xorl %r10d, %r10d
+; SSE2-NEXT: xorl %eax, %eax
 ; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
-; SSE2-NEXT: sete %r10b
-; SSE2-NEXT: orq %r10, (%rdx)
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: orq %rax, (%rdx)
 ; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
 ; SSE2-NEXT: jne .LBB0_2
 ; SSE2-NEXT: # %bb.1: # %if.then
-; SSE2-NEXT: xorq %r9, %rax
-; SSE2-NEXT: xorq %rdi, %r8
-; SSE2-NEXT: xorl %edx, %edx
-; SSE2-NEXT: orq %rax, %r8
-; SSE2-NEXT: sete %dl
-; SSE2-NEXT: orq %rdx, (%rcx)
+; SSE2-NEXT: orq %rax, (%rcx)
 ; SSE2-NEXT: .LBB0_2: # %if.end
 ; SSE2-NEXT: retq
 ;
 ; SSE4-LABEL: pr166534:
 ; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movq (%rdi), %rax
-; SSE4-NEXT: movq 8(%rdi), %r8
 ; SSE4-NEXT: movdqu (%rdi), %xmm0
-; SSE4-NEXT: movq (%rsi), %r9
-; SSE4-NEXT: movq 8(%rsi), %rdi
 ; SSE4-NEXT: movdqu (%rsi), %xmm1
 ; SSE4-NEXT: pxor %xmm0, %xmm1
-; SSE4-NEXT: xorl %esi, %esi
+; SSE4-NEXT: xorl %eax, %eax
 ; SSE4-NEXT: ptest %xmm1, %xmm1
-; SSE4-NEXT: sete %sil
-; SSE4-NEXT: orq %rsi, (%rdx)
+; SSE4-NEXT: sete %al
+; SSE4-NEXT: orq %rax, (%rdx)
 ; SSE4-NEXT: ptest %xmm1, %xmm1
 ; SSE4-NEXT: jne .LBB0_2
 ; SSE4-NEXT: # %bb.1: # %if.then
-; SSE4-NEXT: xorq %r9, %rax
-; SSE4-NEXT: xorq %rdi, %r8
-; SSE4-NEXT: xorl %edx, %edx
-; SSE4-NEXT: orq %rax, %r8
-; SSE4-NEXT: sete %dl
-; SSE4-NEXT: orq %rdx, (%rcx)
+; SSE4-NEXT: orq %rax, (%rcx)
 ; SSE4-NEXT: .LBB0_2: # %if.end
 ; SSE4-NEXT: retq
 ;
 ; AVX2-LABEL: pr166534:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: movq 8(%rdi), %r8
 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-NEXT: movq (%rsi), %rdi
 ; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: movq 8(%rsi), %rsi
-; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: xorl %eax, %eax
 ; AVX2-NEXT: vptest %xmm0, %xmm0
-; AVX2-NEXT: sete %r9b
-; AVX2-NEXT: orq %r9, (%rdx)
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: orq %rax, (%rdx)
 ; AVX2-NEXT: vptest %xmm0, %xmm0
 ; AVX2-NEXT: jne .LBB0_2
 ; AVX2-NEXT: # %bb.1: # %if.then
-; AVX2-NEXT: xorq %rdi, %rax
-; AVX2-NEXT: xorq %rsi, %r8
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: orq %rax, %r8
-; AVX2-NEXT: sete %dl
-; AVX2-NEXT: orq %rdx, (%rcx)
+; AVX2-NEXT: orq %rax, (%rcx)
 ; AVX2-NEXT: .LBB0_2: # %if.end
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: pr166534:
 ; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: movq (%rdi), %rax
-; AVX512-NEXT: movq 8(%rdi), %r8
 ; AVX512-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512-NEXT: movq (%rsi), %r9
-; AVX512-NEXT: movq 8(%rsi), %rdi
 ; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: xorl %eax, %eax
 ; AVX512-NEXT: vptest %xmm0, %xmm0
-; AVX512-NEXT: sete %sil
-; AVX512-NEXT: orq %rsi, (%rdx)
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: orq %rax, (%rdx)
 ; AVX512-NEXT: vptest %xmm0, %xmm0
 ; AVX512-NEXT: jne .LBB0_2
 ; AVX512-NEXT: # %bb.1: # %if.then
-; AVX512-NEXT: xorq %r9, %rax
-; AVX512-NEXT: xorq %rdi, %r8
-; AVX512-NEXT: xorl %edx, %edx
-; AVX512-NEXT: orq %rax, %r8
-; AVX512-NEXT: sete %dl
-; AVX512-NEXT: orq %rdx, (%rcx)
+; AVX512-NEXT: orq %rax, (%rcx)
 ; AVX512-NEXT: .LBB0_2: # %if.end
 ; AVX512-NEXT: retq
 entry:
