Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll | 11
-rw-r--r--  llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll | 11
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll | 351
-rw-r--r--  llvm/test/CodeGen/AArch64/framelayout-split-sve.mir | 133
-rw-r--r--  llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll | 49
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll | 11
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll | 14
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll | 2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/callbr.ll | 54
-rw-r--r--  llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll | 51
-rw-r--r--  llvm/test/CodeGen/AMDGPU/infinite-loop.ll | 257
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll | 100
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll | 161
-rw-r--r--  llvm/test/CodeGen/AMDGPU/update-phi.ll | 39
-rw-r--r--  llvm/test/CodeGen/BPF/bpf_trap.ll | 32
-rw-r--r--  llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll | 116
-rw-r--r--  llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll | 68
-rw-r--r--  llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll | 11
-rw-r--r--  llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll | 41
-rw-r--r--  llvm/test/CodeGen/RISCV/rv64-stackmap.ll | 10
-rw-r--r--  llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/bittest-big-integer.ll | 7083
-rw-r--r--  llvm/test/CodeGen/X86/isel-llvm.sincos.ll | 133
-rw-r--r--  llvm/test/CodeGen/X86/llvm.sincos.vec.ll | 404
-rw-r--r--  llvm/test/DebugInfo/debug-bool-const-value.ll | 29
-rw-r--r--  llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt | 6
-rw-r--r--  llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt | 6
-rw-r--r--  llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s | 8
-rw-r--r--  llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll | 12
-rw-r--r--  llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll | 5
-rw-r--r--  llvm/test/Transforms/InstCombine/or.ll | 95
-rw-r--r--  llvm/test/Transforms/InstCombine/select-safe-transforms.ll | 51
-rw-r--r--  llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll | 30
-rw-r--r--  llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll | 59
-rw-r--r--  llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll | 25
-rw-r--r--  llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll | 247
-rw-r--r--  llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll | 17
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll | 46
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll | 40
-rw-r--r--  llvm/test/Transforms/StructurizeCFG/callbr.ll | 235
-rw-r--r--  llvm/test/lit.cfg.py | 7
-rw-r--r--  llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test | 33
-rw-r--r--  llvm/test/tools/dsymutil/cmdline.test | 1
43 files changed, 3165 insertions, 6943 deletions
diff --git a/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll b/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll
index fa53a18..1920fc9 100644
--- a/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/masked_ldst_vls.ll
@@ -1,17 +1,6 @@
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=256 | FileCheck %s -D#VBITS=256
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=384 | FileCheck %s -D#VBITS=256
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=512 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=640 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=768 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=896 | FileCheck %s -D#VBITS=512
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1024 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1152 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1280 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1408 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1536 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1664 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1792 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1920 | FileCheck %s -D#VBITS=1024
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=2048 | FileCheck %s -D#VBITS=2048
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
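
The dropped RUN lines covered non-power-of-two -aarch64-sve-vector-bits-min values (384 through 1920). As the removed invocations show, each of them rounds down to the same VBITS configuration as a power-of-two RUN line that remains, so they duplicated coverage while multiplying test time. A minimal sketch of how a test like this consumes the numeric variable; the CHECK line and the cost expression are hypothetical, not taken from the file:

; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output \
; RUN:   -aarch64-sve-vector-bits-min=512 | FileCheck %s -D#VBITS=512
; CHECK: Cost Model: Found an estimated cost of [[#div(VBITS,128)]] for instruction

FileCheck's -D#NAME=VALUE defines a numeric variable that [[#...]] expressions can reference, so a single body of CHECK lines serves every tested width.
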
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll
index df40a96..e128987 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-fixed-length.ll
@@ -1,19 +1,8 @@
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output | FileCheck %s -D#VBITS=128
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=128 | FileCheck %s -D#VBITS=128
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=256 | FileCheck %s -D#VBITS=256
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=384 | FileCheck %s -D#VBITS=256
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=512 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=640 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=768 | FileCheck %s -D#VBITS=512
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=896 | FileCheck %s -D#VBITS=512
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1024 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1152 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1280 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1408 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1536 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1664 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1792 | FileCheck %s -D#VBITS=1024
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=1920 | FileCheck %s -D#VBITS=1024
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -aarch64-sve-vector-bits-min=2048 | FileCheck %s -D#VBITS=2048
; VBITS represents the useful bit size of a vector register from the code
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index b54f262..4894932 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -755,199 +755,117 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none
; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-NEXT: cbz w2, .LBB6_3
; CHECK-SD-NEXT: // %bb.1: // %iter.check
-; CHECK-SD-NEXT: str x25, [sp, #-64]! // 8-byte Folded Spill
-; CHECK-SD-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
-; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
-; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
-; CHECK-SD-NEXT: .cfi_offset w19, -8
-; CHECK-SD-NEXT: .cfi_offset w20, -16
-; CHECK-SD-NEXT: .cfi_offset w21, -24
-; CHECK-SD-NEXT: .cfi_offset w22, -32
-; CHECK-SD-NEXT: .cfi_offset w23, -40
-; CHECK-SD-NEXT: .cfi_offset w24, -48
-; CHECK-SD-NEXT: .cfi_offset w25, -64
-; CHECK-SD-NEXT: sxtb x9, w1
; CHECK-SD-NEXT: cmp w2, #3
-; CHECK-SD-NEXT: mov w10, w2
+; CHECK-SD-NEXT: mov w9, w2
; CHECK-SD-NEXT: b.hi .LBB6_4
; CHECK-SD-NEXT: // %bb.2:
-; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x10, xzr
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: b .LBB6_13
; CHECK-SD-NEXT: .LBB6_3:
-; CHECK-SD-NEXT: mov x0, xzr
+; CHECK-SD-NEXT: mov x8, xzr
+; CHECK-SD-NEXT: mov x0, x8
; CHECK-SD-NEXT: ret
; CHECK-SD-NEXT: .LBB6_4: // %vector.main.loop.iter.check
-; CHECK-SD-NEXT: dup v0.2d, x9
; CHECK-SD-NEXT: cmp w2, #16
; CHECK-SD-NEXT: b.hs .LBB6_6
; CHECK-SD-NEXT: // %bb.5:
-; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x10, xzr
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: b .LBB6_10
; CHECK-SD-NEXT: .LBB6_6: // %vector.ph
+; CHECK-SD-NEXT: mov w8, w1
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
-; CHECK-SD-NEXT: mov x8, v0.d[1]
-; CHECK-SD-NEXT: and x12, x10, #0xc
+; CHECK-SD-NEXT: sxtb x8, w8
+; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v6.2d, #0000000000000000
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT: and x11, x10, #0xfffffff0
-; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT: and x11, x9, #0xc
; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
-; CHECK-SD-NEXT: mov x15, x0
; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
-; CHECK-SD-NEXT: and x16, x10, #0xfffffff0
-; CHECK-SD-NEXT: movi v6.2d, #0000000000000000
-; CHECK-SD-NEXT: fmov x13, d0
-; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: and x10, x9, #0xfffffff0
+; CHECK-SD-NEXT: dup v16.4s, w8
+; CHECK-SD-NEXT: mov x8, x0
+; CHECK-SD-NEXT: and x12, x9, #0xfffffff0
; CHECK-SD-NEXT: .LBB6_7: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT: ldr q17, [x15], #16
-; CHECK-SD-NEXT: subs x16, x16, #16
+; CHECK-SD-NEXT: ldr q17, [x8], #16
+; CHECK-SD-NEXT: subs x12, x12, #16
; CHECK-SD-NEXT: ushll v18.8h, v17.8b, #0
-; CHECK-SD-NEXT: ushll2 v19.8h, v17.16b, #0
-; CHECK-SD-NEXT: ushll v17.4s, v18.4h, #0
-; CHECK-SD-NEXT: ushll2 v20.4s, v19.8h, #0
-; CHECK-SD-NEXT: ushll2 v18.4s, v18.8h, #0
-; CHECK-SD-NEXT: ushll v19.4s, v19.4h, #0
-; CHECK-SD-NEXT: ushll v21.2d, v17.2s, #0
-; CHECK-SD-NEXT: ushll2 v22.2d, v20.4s, #0
-; CHECK-SD-NEXT: ushll2 v17.2d, v17.4s, #0
-; CHECK-SD-NEXT: ushll v23.2d, v18.2s, #0
-; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0
-; CHECK-SD-NEXT: ushll2 v18.2d, v18.4s, #0
-; CHECK-SD-NEXT: fmov x17, d21
-; CHECK-SD-NEXT: mov x2, v21.d[1]
-; CHECK-SD-NEXT: ushll v21.2d, v19.2s, #0
-; CHECK-SD-NEXT: ushll2 v19.2d, v19.4s, #0
-; CHECK-SD-NEXT: fmov x18, d22
-; CHECK-SD-NEXT: fmov x1, d17
-; CHECK-SD-NEXT: fmov x3, d23
-; CHECK-SD-NEXT: fmov x21, d20
-; CHECK-SD-NEXT: fmov x22, d18
-; CHECK-SD-NEXT: fmov x19, d21
-; CHECK-SD-NEXT: mul x17, x13, x17
-; CHECK-SD-NEXT: mov x4, v22.d[1]
-; CHECK-SD-NEXT: fmov x24, d19
-; CHECK-SD-NEXT: mov x5, v23.d[1]
-; CHECK-SD-NEXT: mov x6, v21.d[1]
-; CHECK-SD-NEXT: mov x7, v20.d[1]
-; CHECK-SD-NEXT: mov x20, v18.d[1]
-; CHECK-SD-NEXT: mov x23, v19.d[1]
-; CHECK-SD-NEXT: mov x25, v17.d[1]
-; CHECK-SD-NEXT: mul x18, x14, x18
-; CHECK-SD-NEXT: mul x1, x13, x1
-; CHECK-SD-NEXT: fmov d17, x17
-; CHECK-SD-NEXT: mul x3, x13, x3
-; CHECK-SD-NEXT: fmov d18, x18
-; CHECK-SD-NEXT: mul x19, x13, x19
-; CHECK-SD-NEXT: fmov d19, x1
-; CHECK-SD-NEXT: mul x21, x13, x21
-; CHECK-SD-NEXT: fmov d20, x3
-; CHECK-SD-NEXT: mul x22, x13, x22
-; CHECK-SD-NEXT: fmov d21, x19
-; CHECK-SD-NEXT: mul x24, x13, x24
-; CHECK-SD-NEXT: fmov d24, x21
-; CHECK-SD-NEXT: mul x2, x8, x2
-; CHECK-SD-NEXT: fmov d22, x22
-; CHECK-SD-NEXT: mul x4, x8, x4
-; CHECK-SD-NEXT: fmov d23, x24
-; CHECK-SD-NEXT: mul x5, x8, x5
-; CHECK-SD-NEXT: mov v17.d[1], x2
-; CHECK-SD-NEXT: mul x6, x8, x6
-; CHECK-SD-NEXT: mov v18.d[1], x4
-; CHECK-SD-NEXT: mul x7, x8, x7
-; CHECK-SD-NEXT: mov v20.d[1], x5
-; CHECK-SD-NEXT: add v1.2d, v17.2d, v1.2d
-; CHECK-SD-NEXT: mul x20, x8, x20
-; CHECK-SD-NEXT: mov v21.d[1], x6
-; CHECK-SD-NEXT: add v6.2d, v18.2d, v6.2d
-; CHECK-SD-NEXT: mul x23, x8, x23
-; CHECK-SD-NEXT: mov v24.d[1], x7
-; CHECK-SD-NEXT: add v4.2d, v20.2d, v4.2d
-; CHECK-SD-NEXT: mul x17, x8, x25
-; CHECK-SD-NEXT: mov v22.d[1], x20
-; CHECK-SD-NEXT: add v7.2d, v21.2d, v7.2d
-; CHECK-SD-NEXT: mov v23.d[1], x23
-; CHECK-SD-NEXT: add v16.2d, v24.2d, v16.2d
-; CHECK-SD-NEXT: mov v19.d[1], x17
-; CHECK-SD-NEXT: add v3.2d, v22.2d, v3.2d
-; CHECK-SD-NEXT: add v5.2d, v23.2d, v5.2d
-; CHECK-SD-NEXT: add v2.2d, v19.2d, v2.2d
+; CHECK-SD-NEXT: ushll2 v17.8h, v17.16b, #0
+; CHECK-SD-NEXT: ushll2 v19.4s, v18.8h, #0
+; CHECK-SD-NEXT: ushll v20.4s, v17.4h, #0
+; CHECK-SD-NEXT: ushll v18.4s, v18.4h, #0
+; CHECK-SD-NEXT: ushll2 v17.4s, v17.8h, #0
+; CHECK-SD-NEXT: smlal2 v2.2d, v16.4s, v19.4s
+; CHECK-SD-NEXT: smlal2 v4.2d, v16.4s, v20.4s
+; CHECK-SD-NEXT: smlal v6.2d, v16.2s, v20.2s
+; CHECK-SD-NEXT: smlal v3.2d, v16.2s, v19.2s
+; CHECK-SD-NEXT: smlal2 v1.2d, v16.4s, v18.4s
+; CHECK-SD-NEXT: smlal v7.2d, v16.2s, v17.2s
+; CHECK-SD-NEXT: smlal v0.2d, v16.2s, v18.2s
+; CHECK-SD-NEXT: smlal2 v5.2d, v16.4s, v17.4s
; CHECK-SD-NEXT: b.ne .LBB6_7
; CHECK-SD-NEXT: // %bb.8: // %middle.block
-; CHECK-SD-NEXT: add v1.2d, v1.2d, v7.2d
-; CHECK-SD-NEXT: add v4.2d, v4.2d, v16.2d
-; CHECK-SD-NEXT: cmp x11, x10
-; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d
-; CHECK-SD-NEXT: add v3.2d, v3.2d, v6.2d
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v6.2d
+; CHECK-SD-NEXT: add v3.2d, v3.2d, v7.2d
+; CHECK-SD-NEXT: cmp x10, x9
; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d
-; CHECK-SD-NEXT: add v2.2d, v2.2d, v3.2d
+; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
-; CHECK-SD-NEXT: addp d1, v1.2d
-; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: addp d0, v0.2d
+; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: b.eq .LBB6_15
; CHECK-SD-NEXT: // %bb.9: // %vec.epilog.iter.check
-; CHECK-SD-NEXT: cbz x12, .LBB6_13
+; CHECK-SD-NEXT: cbz x11, .LBB6_13
; CHECK-SD-NEXT: .LBB6_10: // %vec.epilog.ph
+; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
+; CHECK-SD-NEXT: mov w11, w1
; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
-; CHECK-SD-NEXT: mov x13, x11
+; CHECK-SD-NEXT: sxtb x11, w11
; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff
-; CHECK-SD-NEXT: fmov x14, d0
-; CHECK-SD-NEXT: and x11, x10, #0xfffffffc
-; CHECK-SD-NEXT: fmov x15, d0
-; CHECK-SD-NEXT: sub x12, x13, x11
-; CHECK-SD-NEXT: add x13, x0, x13
-; CHECK-SD-NEXT: mov v1.d[0], x8
-; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: dup v2.2s, w11
+; CHECK-SD-NEXT: mov x11, x10
+; CHECK-SD-NEXT: and x10, x9, #0xfffffffc
+; CHECK-SD-NEXT: mov v0.d[0], x8
+; CHECK-SD-NEXT: sub x8, x11, x10
+; CHECK-SD-NEXT: add x11, x0, x11
; CHECK-SD-NEXT: .LBB6_11: // %vec.epilog.vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT: ldr s0, [x13], #4
-; CHECK-SD-NEXT: adds x12, x12, #4
-; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
-; CHECK-SD-NEXT: ushll v4.2d, v0.2s, #0
-; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT: ldr s4, [x11], #4
+; CHECK-SD-NEXT: adds x8, x8, #4
+; CHECK-SD-NEXT: ushll v4.8h, v4.8b, #0
+; CHECK-SD-NEXT: ushll v4.4s, v4.4h, #0
+; CHECK-SD-NEXT: ushll v5.2d, v4.2s, #0
+; CHECK-SD-NEXT: ushll2 v4.2d, v4.4s, #0
+; CHECK-SD-NEXT: and v5.16b, v5.16b, v3.16b
; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b
-; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b
-; CHECK-SD-NEXT: fmov x16, d4
-; CHECK-SD-NEXT: fmov x18, d0
-; CHECK-SD-NEXT: mov x17, v4.d[1]
-; CHECK-SD-NEXT: mov x1, v0.d[1]
-; CHECK-SD-NEXT: mul x16, x14, x16
-; CHECK-SD-NEXT: mul x18, x15, x18
-; CHECK-SD-NEXT: mul x17, x8, x17
-; CHECK-SD-NEXT: fmov d0, x16
-; CHECK-SD-NEXT: mul x1, x8, x1
-; CHECK-SD-NEXT: fmov d4, x18
-; CHECK-SD-NEXT: mov v0.d[1], x17
-; CHECK-SD-NEXT: mov v4.d[1], x1
-; CHECK-SD-NEXT: add v1.2d, v0.2d, v1.2d
-; CHECK-SD-NEXT: add v2.2d, v4.2d, v2.2d
+; CHECK-SD-NEXT: xtn v5.2s, v5.2d
+; CHECK-SD-NEXT: xtn v4.2s, v4.2d
+; CHECK-SD-NEXT: smlal v1.2d, v2.2s, v4.2s
+; CHECK-SD-NEXT: smlal v0.2d, v2.2s, v5.2s
; CHECK-SD-NEXT: b.ne .LBB6_11
; CHECK-SD-NEXT: // %bb.12: // %vec.epilog.middle.block
-; CHECK-SD-NEXT: add v0.2d, v1.2d, v2.2d
-; CHECK-SD-NEXT: cmp x11, x10
+; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: cmp x10, x9
; CHECK-SD-NEXT: addp d0, v0.2d
; CHECK-SD-NEXT: fmov x8, d0
; CHECK-SD-NEXT: b.eq .LBB6_15
; CHECK-SD-NEXT: .LBB6_13: // %for.body.preheader
-; CHECK-SD-NEXT: sub x10, x10, x11
-; CHECK-SD-NEXT: add x11, x0, x11
+; CHECK-SD-NEXT: sxtb x11, w1
+; CHECK-SD-NEXT: sub x9, x9, x10
+; CHECK-SD-NEXT: add x10, x0, x10
; CHECK-SD-NEXT: .LBB6_14: // %for.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-SD-NEXT: ldrb w12, [x11], #1
-; CHECK-SD-NEXT: subs x10, x10, #1
-; CHECK-SD-NEXT: smaddl x8, w12, w9, x8
+; CHECK-SD-NEXT: ldrb w12, [x10], #1
+; CHECK-SD-NEXT: subs x9, x9, #1
+; CHECK-SD-NEXT: smaddl x8, w12, w11, x8
; CHECK-SD-NEXT: b.ne .LBB6_14
-; CHECK-SD-NEXT: .LBB6_15:
-; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
-; CHECK-SD-NEXT: ldr x25, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT: .LBB6_15: // %for.cond.cleanup
; CHECK-SD-NEXT: mov x0, x8
; CHECK-SD-NEXT: ret
;
@@ -957,63 +875,64 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none
; CHECK-GI-NEXT: cbz w2, .LBB6_7
; CHECK-GI-NEXT: // %bb.1: // %iter.check
; CHECK-GI-NEXT: movi d0, #0000000000000000
-; CHECK-GI-NEXT: sxtb x9, w1
-; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: mov x10, xzr
; CHECK-GI-NEXT: cmp w2, #4
-; CHECK-GI-NEXT: mov w10, w2
+; CHECK-GI-NEXT: mov w9, w2
; CHECK-GI-NEXT: b.lo .LBB6_12
; CHECK-GI-NEXT: // %bb.2: // %vector.main.loop.iter.check
; CHECK-GI-NEXT: movi d0, #0000000000000000
-; CHECK-GI-NEXT: dup v1.2d, x9
-; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: mov x10, xzr
; CHECK-GI-NEXT: cmp w2, #16
; CHECK-GI-NEXT: b.lo .LBB6_9
; CHECK-GI-NEXT: // %bb.3: // %vector.ph
+; CHECK-GI-NEXT: mov w8, w1
; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
-; CHECK-GI-NEXT: xtn v2.2s, v1.2d
-; CHECK-GI-NEXT: and x8, x10, #0xc
+; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT: sxtb x8, w8
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
-; CHECK-GI-NEXT: and x11, x10, #0xfffffff0
-; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-NEXT: movi v6.2d, #0000000000000000
-; CHECK-GI-NEXT: mov x12, x0
+; CHECK-GI-NEXT: and x10, x9, #0xfffffff0
+; CHECK-GI-NEXT: dup v5.2d, x8
; CHECK-GI-NEXT: movi v7.2d, #0000000000000000
-; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
-; CHECK-GI-NEXT: and x13, x10, #0xfffffff0
-; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
+; CHECK-GI-NEXT: and x8, x9, #0xc
+; CHECK-GI-NEXT: mov x11, x0
+; CHECK-GI-NEXT: and x12, x9, #0xfffffff0
+; CHECK-GI-NEXT: xtn v16.2s, v5.2d
+; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-NEXT: .LBB6_4: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT: ldr q18, [x12], #16
-; CHECK-GI-NEXT: subs x13, x13, #16
-; CHECK-GI-NEXT: ushll v19.8h, v18.8b, #0
-; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0
-; CHECK-GI-NEXT: ushll v20.4s, v19.4h, #0
-; CHECK-GI-NEXT: ushll2 v19.4s, v19.8h, #0
-; CHECK-GI-NEXT: ushll v21.4s, v18.4h, #0
+; CHECK-GI-NEXT: ldr q17, [x11], #16
+; CHECK-GI-NEXT: subs x12, x12, #16
+; CHECK-GI-NEXT: ushll v18.8h, v17.8b, #0
+; CHECK-GI-NEXT: ushll2 v17.8h, v17.16b, #0
+; CHECK-GI-NEXT: ushll v19.4s, v18.4h, #0
; CHECK-GI-NEXT: ushll2 v18.4s, v18.8h, #0
-; CHECK-GI-NEXT: mov d22, v20.d[1]
-; CHECK-GI-NEXT: mov d23, v19.d[1]
-; CHECK-GI-NEXT: mov d24, v21.d[1]
-; CHECK-GI-NEXT: mov d25, v18.d[1]
-; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v20.2s
-; CHECK-GI-NEXT: smlal v4.2d, v2.2s, v19.2s
-; CHECK-GI-NEXT: smlal v6.2d, v2.2s, v21.2s
-; CHECK-GI-NEXT: smlal v16.2d, v2.2s, v18.2s
-; CHECK-GI-NEXT: smlal v3.2d, v2.2s, v22.2s
-; CHECK-GI-NEXT: smlal v5.2d, v2.2s, v23.2s
-; CHECK-GI-NEXT: smlal v7.2d, v2.2s, v24.2s
-; CHECK-GI-NEXT: smlal v17.2d, v2.2s, v25.2s
+; CHECK-GI-NEXT: ushll v20.4s, v17.4h, #0
+; CHECK-GI-NEXT: ushll2 v17.4s, v17.8h, #0
+; CHECK-GI-NEXT: mov d21, v19.d[1]
+; CHECK-GI-NEXT: mov d22, v18.d[1]
+; CHECK-GI-NEXT: mov d23, v20.d[1]
+; CHECK-GI-NEXT: mov d24, v17.d[1]
+; CHECK-GI-NEXT: smlal v0.2d, v16.2s, v19.2s
+; CHECK-GI-NEXT: smlal v2.2d, v16.2s, v18.2s
+; CHECK-GI-NEXT: smlal v4.2d, v16.2s, v20.2s
+; CHECK-GI-NEXT: smlal v6.2d, v16.2s, v17.2s
+; CHECK-GI-NEXT: smlal v1.2d, v16.2s, v21.2s
+; CHECK-GI-NEXT: smlal v3.2d, v16.2s, v22.2s
+; CHECK-GI-NEXT: smlal v5.2d, v16.2s, v23.2s
+; CHECK-GI-NEXT: smlal v7.2d, v16.2s, v24.2s
; CHECK-GI-NEXT: b.ne .LBB6_4
; CHECK-GI-NEXT: // %bb.5: // %middle.block
-; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: add v1.2d, v2.2d, v3.2d
+; CHECK-GI-NEXT: cmp x10, x9
; CHECK-GI-NEXT: add v2.2d, v4.2d, v5.2d
-; CHECK-GI-NEXT: cmp x11, x10
; CHECK-GI-NEXT: add v3.2d, v6.2d, v7.2d
-; CHECK-GI-NEXT: add v4.2d, v16.2d, v17.2d
-; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
-; CHECK-GI-NEXT: add v2.2d, v3.2d, v4.2d
-; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: add v1.2d, v2.2d, v3.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: b.ne .LBB6_8
; CHECK-GI-NEXT: // %bb.6:
@@ -1027,50 +946,54 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none
; CHECK-GI-NEXT: .LBB6_8: // %vec.epilog.iter.check
; CHECK-GI-NEXT: cbz x8, .LBB6_12
; CHECK-GI-NEXT: .LBB6_9: // %vec.epilog.ph
+; CHECK-GI-NEXT: mov w8, w1
; CHECK-GI-NEXT: mov v0.d[1], xzr
-; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
-; CHECK-GI-NEXT: mov x12, x11
-; CHECK-GI-NEXT: xtn v1.2s, v1.2d
-; CHECK-GI-NEXT: and x11, x10, #0xfffffffc
-; CHECK-GI-NEXT: sub x8, x12, x11
-; CHECK-GI-NEXT: add x12, x0, x12
+; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
+; CHECK-GI-NEXT: sxtb x8, w8
+; CHECK-GI-NEXT: mov x11, x10
+; CHECK-GI-NEXT: and x10, x9, #0xfffffffc
+; CHECK-GI-NEXT: dup v2.2d, x8
+; CHECK-GI-NEXT: sub x8, x11, x10
+; CHECK-GI-NEXT: add x11, x0, x11
+; CHECK-GI-NEXT: xtn v2.2s, v2.2d
; CHECK-GI-NEXT: .LBB6_10: // %vec.epilog.vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT: ldr w13, [x12], #4
+; CHECK-GI-NEXT: ldr w12, [x11], #4
; CHECK-GI-NEXT: adds x8, x8, #4
-; CHECK-GI-NEXT: fmov s3, w13
-; CHECK-GI-NEXT: uxtb w13, w13
+; CHECK-GI-NEXT: fmov s3, w12
+; CHECK-GI-NEXT: uxtb w12, w12
; CHECK-GI-NEXT: mov b4, v3.b[2]
; CHECK-GI-NEXT: mov b5, v3.b[1]
; CHECK-GI-NEXT: mov b6, v3.b[3]
-; CHECK-GI-NEXT: fmov s3, w13
-; CHECK-GI-NEXT: fmov w14, s4
-; CHECK-GI-NEXT: fmov w15, s5
-; CHECK-GI-NEXT: fmov w16, s6
+; CHECK-GI-NEXT: fmov s3, w12
+; CHECK-GI-NEXT: fmov w13, s4
+; CHECK-GI-NEXT: fmov w14, s5
+; CHECK-GI-NEXT: fmov w15, s6
+; CHECK-GI-NEXT: uxtb w13, w13
; CHECK-GI-NEXT: uxtb w14, w14
; CHECK-GI-NEXT: uxtb w15, w15
-; CHECK-GI-NEXT: uxtb w16, w16
-; CHECK-GI-NEXT: fmov s4, w14
-; CHECK-GI-NEXT: mov v3.s[1], w15
-; CHECK-GI-NEXT: mov v4.s[1], w16
-; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v3.2s
-; CHECK-GI-NEXT: smlal v2.2d, v1.2s, v4.2s
+; CHECK-GI-NEXT: fmov s4, w13
+; CHECK-GI-NEXT: mov v3.s[1], w14
+; CHECK-GI-NEXT: mov v4.s[1], w15
+; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v3.2s
+; CHECK-GI-NEXT: smlal v1.2d, v2.2s, v4.2s
; CHECK-GI-NEXT: b.ne .LBB6_10
; CHECK-GI-NEXT: // %bb.11: // %vec.epilog.middle.block
-; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
-; CHECK-GI-NEXT: cmp x11, x10
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d
+; CHECK-GI-NEXT: cmp x10, x9
; CHECK-GI-NEXT: addp d0, v0.2d
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: b.eq .LBB6_14
; CHECK-GI-NEXT: .LBB6_12: // %for.body.preheader
-; CHECK-GI-NEXT: sub x10, x10, x11
-; CHECK-GI-NEXT: add x11, x0, x11
+; CHECK-GI-NEXT: sxtb x11, w1
+; CHECK-GI-NEXT: sub x9, x9, x10
+; CHECK-GI-NEXT: add x10, x0, x10
; CHECK-GI-NEXT: .LBB6_13: // %for.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NEXT: ldrb w8, [x11], #1
+; CHECK-GI-NEXT: ldrb w8, [x10], #1
; CHECK-GI-NEXT: fmov x12, d0
-; CHECK-GI-NEXT: subs x10, x10, #1
-; CHECK-GI-NEXT: madd x8, x8, x9, x12
+; CHECK-GI-NEXT: subs x9, x9, #1
+; CHECK-GI-NEXT: madd x8, x8, x11, x12
; CHECK-GI-NEXT: fmov d0, x8
; CHECK-GI-NEXT: b.ne .LBB6_13
; CHECK-GI-NEXT: .LBB6_14: // %for.cond.cleanup
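
Both check prefixes show the same improvement: instead of moving every vector lane into general-purpose registers with fmov, multiplying with scalar mul, and rebuilding vectors (which forced the x19-x25 callee-saved spills in the old prologue), the reduction now stays in vector registers using smlal/smlal2 widening multiply-accumulates against a single dup splat of the sign-extended scalar. A reduced IR sketch of the per-element pattern the loop accumulates; this is assumed from the function name and checks, since the test body lies outside the hunk:

define i64 @sketch(ptr %p, i8 %s, i64 %acc) {
  %b = load i8, ptr %p, align 1 ; unsigned 8-bit element
  %zb = zext i8 %b to i64
  %xs = sext i8 %s to i64       ; signed 8-bit multiplier, splatted in the loop
  %m = mul i64 %zb, %xs         ; widening multiply
  %r = add i64 %acc, %m         ; reduction step
  ret i64 %r
}

Each smlal performs one such multiply-accumulate per lane, so the scalar mul/fmov ping-pong and the extra spills disappear.
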
diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
index 35eafe8..f535e0f 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
@@ -68,13 +68,9 @@
# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.4)
# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
-# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
-# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
-# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
-# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
-# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 2080
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg
# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22
#
# CHECK-NEXT: $x8 = ADDXri $sp, 1040, 0
@@ -83,14 +79,10 @@
# CHECK-NEXT: $x8 = ADDXri $sp, 2064, 0
# CHECK-NEXT: STR_PXI $p0, killed $x8, 18 :: (store (<vscale x 1 x s16>) into %stack.1)
#
-# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
-# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
-# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
-# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
-# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
-# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056
-# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
-# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.4)
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
@@ -100,38 +92,26 @@
# ASM: str x29, [sp, #-16]!
# ASM-NEXT: .cfi_def_cfa_offset 16
# ASM-NEXT: .cfi_offset w29, -16
-# ASM-NEXT: sub sp, sp, #1024
-# ASM-NEXT: .cfi_def_cfa_offset 1040
-# ASM-NEXT: addvl sp, sp, #-1
-# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG
-# ASM-NEXT: sub sp, sp, #1040
-# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
-# ASM-NEXT: addvl sp, sp, #-2
+# ASM-NEXT: sub sp, sp, #2064
+# ASM-NEXT: .cfi_def_cfa_offset 2080
+# ASM-NEXT: addvl sp, sp, #-3
# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG
#
-# ASM: addvl sp, sp, #2
-# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
-# ASM-NEXT: add sp, sp, #1024
-# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG
-# ASM-NEXT: addvl sp, sp, #1
-# ASM-NEXT: .cfi_def_cfa wsp, 1056
-# ASM-NEXT: add sp, sp, #1040
-# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM: add sp, sp, #2064
+# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+# ASM-NEXT: addvl sp, sp, #3
+# ASM-NEXT: .cfi_def_cfa wsp, 16
# ASM-NEXT: ldr x29, [sp], #16
# ASM-NEXT: .cfi_def_cfa_offset 0
# ASM-NEXT: .cfi_restore w29
# UNWINDINFO: DW_CFA_def_cfa_offset: +16
# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
-# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_offset: +2080
# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus
#
-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
-# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056
-# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
# UNWINDINFO: DW_CFA_def_cfa_offset: +0
# UNWINDINFO-NEXT: DW_CFA_restore: reg29
@@ -270,13 +250,9 @@ body: |
# CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.5)
# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
-# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
-# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
-# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
-# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
-# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 2080
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg
# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22
#
# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0
@@ -286,14 +262,10 @@ body: |
# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 2064, 0
# CHECK-NEXT: STR_PXI $p0, killed $[[TMP]], 23
#
-# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
-# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
-# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
-# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
-# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
-# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056
-# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
-# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5)
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
@@ -303,38 +275,27 @@ body: |
# ASM: str x29, [sp, #-16]!
# ASM-NEXT: .cfi_def_cfa_offset 16
# ASM-NEXT: .cfi_offset w29, -16
-# ASM-NEXT: sub sp, sp, #1024
-# ASM-NEXT: .cfi_def_cfa_offset 1040
-# ASM-NEXT: addvl sp, sp, #-1
-# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG
-# ASM-NEXT: sub sp, sp, #1040
-# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
-# ASM-NEXT: addvl sp, sp, #-2
+# ASM-NEXT: sub sp, sp, #2064
+# ASM-NEXT: .cfi_def_cfa_offset 2080
+# ASM-NEXT: addvl sp, sp, #-3
# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG
#
-# ASM: addvl sp, sp, #2
-# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
-# ASM-NEXT: add sp, sp, #1024
-# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG
-# ASM-NEXT: addvl sp, sp, #1
-# ASM-NEXT: .cfi_def_cfa wsp, 1056
-# ASM-NEXT: add sp, sp, #1040
-# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM: add sp, sp, #2064
+# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+# ASM-NEXT: addvl sp, sp, #3
+# ASM-NEXT: .cfi_def_cfa wsp, 16
# ASM-NEXT: ldr x29, [sp], #16
# ASM-NEXT: .cfi_def_cfa_offset 0
# ASM-NEXT: .cfi_restore w29
+# ASM-NEXT: ret
# UNWINDINFO: DW_CFA_def_cfa_offset: +16
# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
-# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_offset: +2080
# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus
#
-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
-# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
-# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056
-# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
# UNWINDINFO: DW_CFA_def_cfa_offset: +0
# UNWINDINFO-NEXT: DW_CFA_restore: reg29
@@ -385,10 +346,8 @@ body: |
# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
-# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
-# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
-# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg
#
# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0
# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], -2
@@ -396,10 +355,8 @@ body: |
# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], -3
# CHECK-NEXT: STR_PXI $p0, $fp, -1
#
-# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
-# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
-# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
-# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.6), (load (s64) from %stack.5)
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
@@ -414,15 +371,11 @@ body: |
# ASM-NEXT: .cfi_def_cfa w29, 16
# ASM-NEXT: .cfi_offset w30, -8
# ASM-NEXT: .cfi_offset w29, -16
-# ASM-NEXT: sub sp, sp, #1024
-# ASM-NEXT: addvl sp, sp, #-1
-# ASM-NEXT: sub sp, sp, #1040
-# ASM-NEXT: addvl sp, sp, #-2
+# ASM-NEXT: sub sp, sp, #2064
+# ASM-NEXT: addvl sp, sp, #-3
#
-# ASM: addvl sp, sp, #2
-# ASM-NEXT: add sp, sp, #1024
-# ASM-NEXT: addvl sp, sp, #1
-# ASM-NEXT: add sp, sp, #1040
+# ASM: add sp, sp, #2064
+# ASM-NEXT: addvl sp, sp, #3
# ASM-NEXT: .cfi_def_cfa wsp, 16
# ASM-NEXT: ldp x29, x30, [sp], #16
# ASM-NEXT: .cfi_def_cfa_offset 0
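
The consolidated prologue and epilogue describe the CFA with a single DWARF expression instead of a chain of intermediate ones. Decoding the epilogue escape bytes from the check lines above, byte by byte (register 0x2e = 46 is the AArch64 VG pseudo-register, the vector granule; the result matches the "// sp + 16 + 24 * VG" annotation already present in the test):

# .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22
#   0x0f           DW_CFA_def_cfa_expression
#   0x08           expression block length: 8 bytes
#   0x8f 0x10      DW_OP_breg31 +16     (sp + 16)
#   0x92 0x2e 0x00 DW_OP_bregx 46 +0    (push VG)
#   0x48           DW_OP_lit24          (push 24)
#   0x1e           DW_OP_mul            (24 * VG)
#   0x22           DW_OP_plus           (CFA = sp + 16 + 24 * VG)
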
diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
index 690a39d..c13dd33 100644
--- a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
@@ -19,20 +19,16 @@ define void @zpr_and_ppr_local(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vec
; CHECK-LABEL: zpr_and_ppr_local:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: sub sp, sp, #1024
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: sub sp, sp, #1024
-; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #2048
+; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: add x8, sp, #2048
; CHECK-NEXT: str p0, [x8, #15, mul vl]
; CHECK-NEXT: add x8, sp, #1024
; CHECK-NEXT: str z0, [x8]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: add sp, sp, #1024
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: add sp, sp, #2048
+; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%ppr_local = alloca <vscale x 16 x i1>
@@ -62,20 +58,16 @@ define void @zpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #1024
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: sub sp, sp, #1024
-; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #2048
+; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: sub x8, x29, #1024
; CHECK-NEXT: str p0, [x29, #-1, mul vl]
; CHECK-NEXT: str z0, [x8, #-2, mul vl]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: add sp, sp, #1024
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: add sp, sp, #2048
+; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%ppr_local = alloca <vscale x 16 x i1>
@@ -103,17 +95,15 @@ define void @fpr_and_ppr_local(<vscale x 16 x i1> %pred, double %double) "aarch6
; CHECK-LABEL: fpr_and_ppr_local:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: sub sp, sp, #2064
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: sub sp, sp, #1040
; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: add x8, sp, #2064
; CHECK-NEXT: str p0, [x8, #7, mul vl]
; CHECK-NEXT: str d0, [sp, #1032]
-; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: add sp, sp, #2064
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: add sp, sp, #1040
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%ppr_local = alloca <vscale x 16 x i1>
@@ -144,17 +134,15 @@ define void @fpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, double %double) "aar
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: mov x29, sp
-; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: sub sp, sp, #2064
; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: sub sp, sp, #1040
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: str p0, [x29, #-1, mul vl]
; CHECK-NEXT: str d0, [sp, #1032]
-; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: add sp, sp, #2064
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: add sp, sp, #1040
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: ret
%ppr_local = alloca <vscale x 16 x i1>
@@ -793,11 +781,8 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x
; CHECK-LABEL: zpr_and_ppr_local_stack_probing:
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: sub sp, sp, #1024
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str xzr, [sp]
-; CHECK-NEXT: sub sp, sp, #1824
-; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #2848
+; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xb0, 0x16, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2864 + 16 * VG
; CHECK-NEXT: .cfi_offset w29, -16
@@ -806,10 +791,8 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x
; CHECK-NEXT: add x8, sp, #1824
; CHECK-NEXT: str z0, [x8]
; CHECK-NEXT: str x0, [sp]
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: add sp, sp, #1024
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: add sp, sp, #1824
+; CHECK-NEXT: add sp, sp, #2848
+; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
"probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" "aarch64_pstate_sm_compatible"
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
index becddae..b2ed8de 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll
@@ -1,19 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | not grep ptrue
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
-; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
-; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu"
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
index e86f747..37b5422 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
-; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s
+; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s
; Note: we use MIR test checks + stop after legalizer to prevent
; tests from being optimized out.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
index 44b12a9..61a6137 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s
declare void @readsMem(ptr) #0
declare void @writesMem(ptr) #1
diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll
new file mode 100644
index 0000000..253a6ec
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/callbr.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
+
+define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) {
+; CHECK-LABEL: callbr_inline_asm:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_load_dword v0, v[0:1]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; %bb.1: ; %fallthrough
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[2:3], v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target
+; CHECK-NEXT: ; %indirect
+; CHECK-NEXT: ; Label of block must be emitted
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_dword v[4:5], v0
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %a = load i32, ptr %src, align 4
+ callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect]
+fallthrough:
+ store i32 %a, ptr %dst1, align 4
+ br label %ret
+indirect:
+ store i32 %a, ptr %dst2, align 4
+ br label %ret
+ret:
+ ret void
+}
+
+define void @callbr_self_loop(i1 %c) {
+; CHECK-LABEL: callbr_self_loop:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: .LBB1_1: ; %callbr
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_branch .LBB1_1
+; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target
+; CHECK-NEXT: ; %callbr.target.ret
+; CHECK-NEXT: ; Label of block must be emitted
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ br label %callbr
+callbr:
+ callbr void asm "", "!i"() to label %callbr [label %ret]
+ret:
+ ret void
+}
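
This new test pins down callbr lowering on AMDGPU: the "to label" successor is the inline-asm fallthrough and needs no label of its own, while every listed indirect destination keeps a forced label (the "; Label of block must be emitted" comment) because the asm may branch to it by name. In the constraint string, each "!i" entry after the ordinary input constraints corresponds to one indirect destination label; the minimal shape, as a sketch:

  callbr void asm "...", "r,!i"(i32 %x)
      to label %fallthrough [label %indirect]
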
diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
index 007e3f0..076a99f 100644
--- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
+++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll
@@ -3,6 +3,7 @@
declare void @foo(ptr)
declare i1 @bar(ptr)
+declare i32 @bar32(ptr)
define void @musttail_call_without_return_value(ptr %p) {
; CHECK-LABEL: define void @musttail_call_without_return_value(
@@ -28,6 +29,31 @@ bb.1:
ret void
}
+define void @musttail_call_without_return_value_callbr(ptr %p) {
+; CHECK-LABEL: define void @musttail_call_without_return_value_callbr(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1
+; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]])
+; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1]
+; CHECK: [[BB_0]]:
+; CHECK-NEXT: musttail call void @foo(ptr [[P]])
+; CHECK-NEXT: ret void
+; CHECK: [[BB_1:.*:]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %load = load i32, ptr %p, align 1
+ callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
+
+bb.0:
+ musttail call void @foo(ptr %p)
+ ret void
+
+bb.1:
+ ret void
+}
+
define i1 @musttail_call_with_return_value(ptr %p) {
; CHECK-LABEL: define i1 @musttail_call_with_return_value(
; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
@@ -51,3 +77,28 @@ bb.0:
bb.1:
ret i1 %load
}
+
+define i32 @musttail_call_with_return_value_callbr(ptr %p) {
+; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr(
+; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1
+; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]])
+; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1]
+; CHECK: [[BB_0]]:
+; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]])
+; CHECK-NEXT: ret i32 [[RET]]
+; CHECK: [[BB_1:.*:]]
+; CHECK-NEXT: ret i32 [[LOAD]]
+;
+entry:
+ %load = load i32, ptr %p, align 1
+ callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1]
+
+bb.0:
+ %ret = musttail call i32 @bar32(ptr %p)
+ ret i32 %ret
+
+bb.1:
+ ret i32 %load
+}
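
The callbr variants check the same property as the existing branch-based tests: a musttail call must be immediately followed by a ret that returns its result (or void), so the unify-divergent-exit-nodes pass has to leave these exit blocks alone instead of funneling them into a single unified return. The invariant in reduced form, as a sketch:

  %ret = musttail call i32 @bar32(ptr %p) ; must directly precede the ret
  ret i32 %ret
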
diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
index 3e2e43f..df63592 100644
--- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -36,26 +36,60 @@ loop:
br label %loop
}
+define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) {
+; SI-LABEL: infinite_loop_callbr:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: ;;#ASMSTART
+; SI-NEXT: ;;#ASMEND
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_endpgm
+; IR-LABEL: @infinite_loop_callbr(
+; IR-NEXT: entry:
+; IR-NEXT: callbr void asm "", ""()
+; IR-NEXT: to label [[LOOP:%.*]] []
+; IR: loop:
+; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]]
+; IR: TransitionBlock:
+; IR-NEXT: callbr void asm "", ""()
+; IR-NEXT: to label [[LOOP]] []
+; IR: DummyReturnBlock:
+; IR-NEXT: ret void
+;
+entry:
+ callbr void asm "", ""() to label %loop []
+
+loop:
+ store volatile i32 999, ptr addrspace(1) %out, align 4
+ callbr void asm "", ""() to label %loop []
+}
+
define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) {
; SI-LABEL: infinite_loop_ret:
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; SI-NEXT: s_cbranch_execz .LBB1_3
+; SI-NEXT: s_cbranch_execz .LBB2_3
; SI-NEXT: ; %bb.1: ; %loop.preheader
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_and_b64 vcc, exec, -1
-; SI-NEXT: .LBB1_2: ; %loop
+; SI-NEXT: .LBB2_2: ; %loop
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccnz .LBB1_2
-; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock
+; SI-NEXT: s_cbranch_vccnz .LBB2_2
+; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock
; SI-NEXT: s_endpgm
; IR-LABEL: @infinite_loop_ret(
; IR-NEXT: entry:
@@ -81,44 +115,93 @@ return:
ret void
}
+define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) {
+; SI-LABEL: infinite_loop_ret_callbr:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: ;;#ASMSTART
+; SI-NEXT: ;;#ASMEND
+; SI-NEXT: ; %bb.1: ; %loop.preheader
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: .LBB3_2: ; Inline asm indirect target
+; SI-NEXT: ; %UnifiedReturnBlock
+; SI-NEXT: ; Label of block must be emitted
+; SI-NEXT: s_endpgm
+; IR-LABEL: @infinite_loop_ret_callbr(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1
+; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32
+; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]])
+; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock]
+; IR: loop:
+; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]]
+; IR: TransitionBlock:
+; IR-NEXT: callbr void asm "", ""()
+; IR-NEXT: to label [[LOOP]] []
+; IR: UnifiedReturnBlock:
+; IR-NEXT: ret void
+;
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %cond = icmp eq i32 %tmp, 1
+ %cond32 = zext i1 %cond to i32
+ callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return]
+
+loop:
+ store volatile i32 999, ptr addrspace(1) %out, align 4
+ callbr void asm "", ""() to label %loop []
+
+return:
+ ret void
+}
+
define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) {
; SI-LABEL: infinite_loops:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b64 s[2:3], -1
-; SI-NEXT: s_cbranch_scc1 .LBB2_4
+; SI-NEXT: s_cbranch_scc1 .LBB4_4
; SI-NEXT: ; %bb.1:
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x378
; SI-NEXT: s_and_b64 vcc, exec, -1
-; SI-NEXT: .LBB2_2: ; %loop2
+; SI-NEXT: .LBB4_2: ; %loop2
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccnz .LBB2_2
+; SI-NEXT: s_cbranch_vccnz .LBB4_2
; SI-NEXT: ; %bb.3: ; %Flow
; SI-NEXT: s_mov_b64 s[2:3], 0
-; SI-NEXT: .LBB2_4: ; %Flow2
+; SI-NEXT: .LBB4_4: ; %Flow2
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccz .LBB2_7
+; SI-NEXT: s_cbranch_vccz .LBB4_7
; SI-NEXT: ; %bb.5:
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
; SI-NEXT: s_and_b64 vcc, exec, 0
-; SI-NEXT: .LBB2_6: ; %loop1
+; SI-NEXT: .LBB4_6: ; %loop1
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_mov_b64 vcc, vcc
-; SI-NEXT: s_cbranch_vccz .LBB2_6
-; SI-NEXT: .LBB2_7: ; %DummyReturnBlock
+; SI-NEXT: s_cbranch_vccz .LBB4_6
+; SI-NEXT: .LBB4_7: ; %DummyReturnBlock
; SI-NEXT: s_endpgm
; IR-LABEL: @infinite_loops(
; IR-NEXT: entry:
@@ -144,24 +227,78 @@ loop2:
br label %loop2
}
+define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) {
+; SI-LABEL: infinite_loops_callbr:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ;;#ASMSTART
+; SI-NEXT: ;;#ASMEND
+; SI-NEXT: ; %bb.1: ; %loop1
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_endpgm
+; SI-NEXT: .LBB5_2: ; Inline asm indirect target
+; SI-NEXT: ; %loop2.preheader
+; SI-NEXT: ; Label of block must be emitted
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
+; SI-NEXT: v_mov_b32_e32 v0, 0x378
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_endpgm
+; IR-LABEL: @infinite_loops_callbr(
+; IR-NEXT: entry:
+; IR-NEXT: callbr void asm "", "r,!i"(i32 poison)
+; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2]
+; IR: loop1:
+; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]]
+; IR: TransitionBlock:
+; IR-NEXT: callbr void asm "", ""()
+; IR-NEXT: to label [[LOOP1]] []
+; IR: loop2:
+; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4
+; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]]
+; IR: TransitionBlock1:
+; IR-NEXT: callbr void asm "", ""()
+; IR-NEXT: to label [[LOOP2:%.*]] []
+; IR: DummyReturnBlock:
+; IR-NEXT: ret void
+;
+entry:
+ callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2]
+
+loop1:
+ store volatile i32 999, ptr addrspace(1) %out, align 4
+ callbr void asm "", ""() to label %loop1 []
+
+loop2:
+ store volatile i32 888, ptr addrspace(1) %out, align 4
+ callbr void asm "", ""() to label %loop2 []
+}
+
define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) {
; SI-LABEL: infinite_loop_nest_ret:
; SI: ; %bb.0: ; %entry
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc
-; SI-NEXT: s_cbranch_execz .LBB3_5
+; SI-NEXT: s_cbranch_execz .LBB6_5
; SI-NEXT: ; %bb.1: ; %outer_loop.preheader
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
-; SI-NEXT: .LBB3_2: ; %outer_loop
+; SI-NEXT: .LBB6_2: ; %outer_loop
; SI-NEXT: ; =>This Loop Header: Depth=1
-; SI-NEXT: ; Child Loop BB3_3 Depth 2
+; SI-NEXT: ; Child Loop BB6_3 Depth 2
; SI-NEXT: s_mov_b64 s[2:3], 0
-; SI-NEXT: .LBB3_3: ; %inner_loop
-; SI-NEXT: ; Parent Loop BB3_2 Depth=1
+; SI-NEXT: .LBB6_3: ; %inner_loop
+; SI-NEXT: ; Parent Loop BB6_2 Depth=1
; SI-NEXT: ; => This Inner Loop Header: Depth=2
; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1]
; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
@@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) {
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; SI-NEXT: s_cbranch_execnz .LBB3_3
+; SI-NEXT: s_cbranch_execnz .LBB6_3
; SI-NEXT: ; %bb.4: ; %loop.exit.guard
-; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1
+; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: s_branch .LBB3_2
-; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock
+; SI-NEXT: s_branch .LBB6_2
+; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock
; SI-NEXT: s_endpgm
; IR-LABEL: @infinite_loop_nest_ret(
; IR-NEXT: entry:
@@ -212,4 +349,82 @@ return:
ret void
}
+define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) {
+; SI-LABEL: infinite_loop_nest_ret_callbr:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: ;;#ASMSTART
+; SI-NEXT: ;;#ASMEND
+; SI-NEXT: ; %bb.1: ; %outer_loop.preheader
+; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, -1
+; SI-NEXT: v_mov_b32_e32 v0, 0x3e7
+; SI-NEXT: s_and_b64 s[0:1], exec, 0
+; SI-NEXT: s_branch .LBB7_3
+; SI-NEXT: .LBB7_2: ; %loop.exit.guard
+; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1
+; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT: s_cbranch_vccnz .LBB7_5
+; SI-NEXT: .LBB7_3: ; %outer_loop
+; SI-NEXT: ; =>This Inner Loop Header: Depth=1
+; SI-NEXT: ;;#ASMSTART
+; SI-NEXT: ;;#ASMEND
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: s_mov_b64 s[2:3], -1
+; SI-NEXT: s_mov_b64 vcc, s[0:1]
+; SI-NEXT: s_cbranch_vccz .LBB7_2
+; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop
+; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1
+; SI-NEXT: s_mov_b64 s[2:3], 0
+; SI-NEXT: s_branch .LBB7_2
+; SI-NEXT: .LBB7_5: ; Inline asm indirect target
+; SI-NEXT: ; %UnifiedReturnBlock
+; SI-NEXT: ; Label of block must be emitted
+; SI-NEXT: s_endpgm
+; IR-LABEL: @infinite_loop_nest_ret_callbr(
+; IR-NEXT: entry:
+; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1
+; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32
+; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]])
+; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock]
+; IR: outer_loop:
+; IR-NEXT: callbr void asm "", ""()
+; IR-NEXT: to label [[INNER_LOOP:%.*]] []
+; IR: inner_loop:
+; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4
+; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3
+; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32
+; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]]
+; IR: TransitionBlock:
+; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]])
+; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop]
+; IR: UnifiedReturnBlock:
+; IR-NEXT: ret void
+;
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %cond1 = icmp ne i32 %tmp, 1 ; keep the following BB from being optimized away due to dominance
+ %cond1_32 = zext i1 %cond1 to i32
+ callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return]
+
+outer_loop:
+ ; %cond2 = icmp eq i32 %tmp, 2
+ ; br i1 %cond2, label %outer_loop, label %inner_loop
+ callbr void asm "", ""() to label %inner_loop []
+
+inner_loop: ; preds = %outer_loop, %inner_loop
+ store volatile i32 999, ptr addrspace(1) %out, align 4
+ %cond3 = icmp eq i32 %tmp, 3
+ %cond3_32 = zext i1 %cond3 to i32
+ callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop]
+
+return:
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
index 34de1e4..01bcdad 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -3,15 +3,16 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA
define void @nested_inf_loop(i1 %0, i1 %1) {
-; OPT-LABEL: @nested_inf_loop(
-; OPT-NEXT: BB:
-; OPT-NEXT: br label [[BB1:%.*]]
-; OPT: BB1:
-; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]]
-; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]]
-; OPT: infloop:
-; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]]
-; OPT: DummyReturnBlock:
+; OPT-LABEL: define void @nested_inf_loop(
+; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: br label %[[BB1:.*]]
+; OPT: [[BB1]]:
+; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]]
+; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]]
+; OPT: [[INFLOOP]]:
+; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]]
+; OPT: [[DUMMYRETURNBLOCK]]:
; OPT-NEXT: ret void
;
; ISA-LABEL: nested_inf_loop:
@@ -63,3 +64,84 @@ BB4:
BB3:
br label %BB1
}
+
+define void @nested_inf_loop_callbr(i32 %0, i32 %1) {
+; OPT-LABEL: define void @nested_inf_loop_callbr(
+; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) {
+; OPT-NEXT: [[BB:.*:]]
+; OPT-NEXT: callbr void asm "", ""()
+; OPT-NEXT: to label %[[BB1:.*]] []
+; OPT: [[BB1]]:
+; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]])
+; OPT-NEXT: to label %[[BB3:.*]] [label %BB2]
+; OPT: [[BB2:.*:]]
+; OPT-NEXT: callbr void asm "", ""()
+; OPT-NEXT: to label %[[BB4:.*]] []
+; OPT: [[BB4]]:
+; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]]
+; OPT: [[TRANSITIONBLOCK]]:
+; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]])
+; OPT-NEXT: to label %[[BB3]] [label %BB4]
+; OPT: [[BB3]]:
+; OPT-NEXT: callbr void asm "", ""()
+; OPT-NEXT: to label %[[BB1]] []
+; OPT: [[DUMMYRETURNBLOCK]]:
+; OPT-NEXT: ret void
+;
+; ISA-LABEL: nested_inf_loop_callbr:
+; ISA: ; %bb.0: ; %BB
+; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISA-NEXT: ;;#ASMSTART
+; ISA-NEXT: ;;#ASMEND
+; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7
+; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5
+; ISA-NEXT: .LBB1_1: ; %BB1
+; ISA-NEXT: ; =>This Inner Loop Header: Depth=1
+; ISA-NEXT: ;;#ASMSTART
+; ISA-NEXT: ;;#ASMEND
+; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec
+; ISA-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; ISA-NEXT: .LBB1_2: ; %BB3
+; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; ISA-NEXT: ;;#ASMSTART
+; ISA-NEXT: ;;#ASMEND
+; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
+; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec
+; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
+; ISA-NEXT: s_branch .LBB1_1
+; ISA-NEXT: .LBB1_3: ; Inline asm indirect target
+; ISA-NEXT: ; %BB2
+; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; ISA-NEXT: ; Label of block must be emitted
+; ISA-NEXT: ;;#ASMSTART
+; ISA-NEXT: ;;#ASMEND
+; ISA-NEXT: s_mov_b64 s[6:7], -1
+; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; ISA-NEXT: s_cbranch_execz .LBB1_5
+; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3
+; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; ISA-NEXT: s_xor_b64 s[6:7], exec, -1
+; ISA-NEXT: .LBB1_5: ; %loop.exit.guard
+; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1
+; ISA-NEXT: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_and_b64 vcc, exec, s[6:7]
+; ISA-NEXT: s_mov_b64 s[6:7], 0
+; ISA-NEXT: s_cbranch_vccz .LBB1_2
+; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock
+; ISA-NEXT: s_setpc_b64 s[30:31]
+BB:
+ callbr void asm "", ""() to label %BB1 []
+
+BB1:
+ callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2]
+
+BB2:
+ callbr void asm "", ""() to label %BB4 []
+
+BB4:
+ callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4]
+
+BB3:
+ callbr void asm "", ""() to label %BB1 []
+}
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
index 4cbe682..004c279 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -passes=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY
; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s
declare void @llvm.trap()
@@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) {
; CHECK-NEXT: s_mov_b64 s[2:3], -1
; CHECK-NEXT: s_trap 2
; CHECK-NEXT: s_branch .LBB0_4
-
-
+; UNIFY-LABEL: @kernel(
+; UNIFY-NEXT: entry:
+; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256
+; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; UNIFY: if.then:
+; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]]
+; UNIFY: cond.false:
+; UNIFY-NEXT: call void @llvm.trap()
+; UNIFY-NEXT: unreachable
+; UNIFY: if.else:
+; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10
+; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]]
+; UNIFY: if.then3:
+; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0
+; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]]
+; UNIFY: cond.false.i8:
+; UNIFY-NEXT: call void @llvm.trap()
+; UNIFY-NEXT: unreachable
+; UNIFY: if.end6.sink.split:
+; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]]
+; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4
+; UNIFY-NEXT: br label [[IF_END6]]
+; UNIFY: if.end6:
+; UNIFY-NEXT: ret void
+;
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%cmp = icmp eq i32 %n, 256
@@ -105,5 +130,129 @@ if.end6.sink.split:
if.end6:
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; UNIFY: {{.*}}
+
+define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) {
+; CHECK-LABEL: kernel_callbr:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10
+; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; %bb.1: ; %if.then
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, s0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_store_dword v0, v1, s[2:3]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target
+; CHECK-NEXT: ; %UnifiedReturnBlock
+; CHECK-NEXT: ; Label of block must be emitted
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target
+; CHECK-NEXT: ; %if.else
+; CHECK-NEXT: ; Label of block must be emitted
+; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; %bb.5: ; %if.then3
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_branch .LBB1_2
+; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target
+; CHECK-NEXT: ; %cond.false.i8
+; CHECK-NEXT: ; Label of block must be emitted
+; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target
+; CHECK-NEXT: ; %cond.false
+; CHECK-NEXT: ; Label of block must be emitted
+; CHECK-NEXT: s_trap 2
+; CHECK-NEXT: ; divergent unreachable
+; CHECK-NEXT: s_branch .LBB1_3
+; UNIFY-LABEL: @kernel_callbr(
+; UNIFY-NEXT: entry:
+; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256
+; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32
+; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]])
+; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else]
+; UNIFY: if.then:
+; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32
+; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_32]])
+; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false]
+; UNIFY: cond.false:
+; UNIFY-NEXT: call void @llvm.trap()
+; UNIFY-NEXT: unreachable
+; UNIFY: if.else:
+; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10
+; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32
+; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]])
+; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6]
+; UNIFY: if.then3:
+; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0
+; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32
+; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]])
+; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8]
+; UNIFY: cond.false.i8:
+; UNIFY-NEXT: call void @llvm.trap()
+; UNIFY-NEXT: unreachable
+; UNIFY: if.end6.sink.split:
+; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]]
+; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4
+; UNIFY-NEXT: callbr void asm "", ""()
+; UNIFY-NEXT: to label [[IF_END6:%.*]] []
+; UNIFY: if.end6:
+; UNIFY-NEXT: ret void
+;
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %cmp = icmp eq i32 %n, 256
+ %cmp32 = zext i1 %cmp to i32
+ callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else]
+
+if.then:
+ %cmp1 = icmp eq i32 %a, 0
+ %cmp1_32 = zext i1 %cmp1 to i32
+ callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false]
+
+cond.false:
+ call void @llvm.trap()
+ unreachable
+
+if.else:
+ %cmp2 = icmp ult i32 %tid, 10
+ %cmp2_32 = zext i1 %cmp2 to i32
+ callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6]
+
+if.then3:
+ %cmp1.i7 = icmp eq i32 %a, 0
+ %cmp1.i7_32 = zext i1 %cmp1.i7 to i32
+ callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8]
+
+cond.false.i8:
+ call void @llvm.trap()
+ unreachable
+
+if.end6.sink.split:
+ %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid
+ store i32 %a, ptr addrspace(1) %x1, align 4
+ callbr void asm "", ""() to label %if.end6 []
+
+if.end6:
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll
index 50666be..684dc1a 100644
--- a/llvm/test/CodeGen/AMDGPU/update-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll
@@ -37,3 +37,42 @@ n28: ; preds = %.loopexit, %n28
n31: ; preds =
ret void
}
+
+define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 {
+; IR-LABEL: @_amdgpu_ps_main_callbr(
+; IR-NEXT: .entry:
+; IR-NEXT: callbr void asm "", ""()
+; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] []
+; IR: .loopexit:
+; IR-NEXT: callbr void asm "", ""()
+; IR-NEXT: to label [[N28:%.*]] []
+; IR: n28:
+; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ]
+; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00
+; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00
+; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32
+; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]]
+; IR: TransitionBlock:
+; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]])
+; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28]
+; IR: n31:
+; IR-NEXT: ret void
+; IR: DummyReturnBlock:
+; IR-NEXT: ret void
+;
+.entry:
+ callbr void asm "", ""() to label %.loopexit []
+
+.loopexit: ; preds = %n28, %.entry
+ callbr void asm "", ""() to label %n28 []
+
+n28: ; preds = %.loopexit, %n28
+ %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ]
+ %n29 = fadd float %.01, 1.0
+ %n30 = fcmp ogt float %n29, 4.000000e+00
+ %n30.32 = zext i1 %n30 to i32
+ callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28]
+
+n31: ; preds =
+ ret void
+}
diff --git a/llvm/test/CodeGen/BPF/bpf_trap.ll b/llvm/test/CodeGen/BPF/bpf_trap.ll
new file mode 100644
index 0000000..ab8df5f
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/bpf_trap.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s | FileCheck %s
+;
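+; The masked switch input covers all four case values, so the default
+; destination is unreachable and no __bpf_trap call should be emitted
+; (see the CHECK-NOT below).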
+target triple = "bpf"
+
+define i32 @test(i8 %x) {
+entry:
+ %0 = and i8 %x, 3
+ switch i8 %0, label %default.unreachable4 [
+ i8 0, label %return
+ i8 1, label %sw.bb1
+ i8 2, label %sw.bb2
+ i8 3, label %sw.bb3
+ ]
+
+sw.bb1: ; preds = %entry
+ br label %return
+
+sw.bb2: ; preds = %entry
+ br label %return
+
+sw.bb3: ; preds = %entry
+ br label %return
+
+default.unreachable4: ; preds = %entry
+ unreachable
+
+return: ; preds = %entry, %sw.bb3, %sw.bb2, %sw.bb1
+ %retval.0 = phi i32 [ 12, %sw.bb1 ], [ 43, %sw.bb2 ], [ 54, %sw.bb3 ], [ 32, %entry ]
+ ret i32 %retval.0
+}
+
+; CHECK-NOT: __bpf_trap
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll
index 48ec98c..8e08e1e 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll
@@ -5,40 +5,10 @@
define void @minnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind {
; CHECK-LABEL: minnum_v8f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a2, 0
-; CHECK-NEXT: xvld $xr1, $a1, 0
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5
-; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5
-; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2
-; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4
-; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4
-; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3
-; CHECK-NEXT: vextrins.w $vr3, $vr2, 16
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6
-; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6
-; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2
-; CHECK-NEXT: vextrins.w $vr3, $vr2, 32
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7
-; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7
-; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2
-; CHECK-NEXT: vextrins.w $vr3, $vr2, 48
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1
-; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1
-; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2
-; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0
-; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0
-; CHECK-NEXT: fmin.s $fa4, $fa5, $fa4
-; CHECK-NEXT: vextrins.w $vr4, $vr2, 16
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2
-; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2
-; CHECK-NEXT: fmin.s $fa2, $fa5, $fa2
-; CHECK-NEXT: vextrins.w $vr4, $vr2, 32
-; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3
-; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0
-; CHECK-NEXT: vextrins.w $vr4, $vr0, 48
-; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2
-; CHECK-NEXT: xvst $xr4, $a0, 0
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: xvfmin.s $xr0, $xr0, $xr1
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <8 x float>, ptr %x
@@ -51,23 +21,9 @@ entry:
define void @minnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind {
; CHECK-LABEL: minnum_v4f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a2, 0
-; CHECK-NEXT: xvld $xr1, $a1, 0
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3
-; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3
-; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2
-; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2
-; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2
-; CHECK-NEXT: fmin.d $fa3, $fa4, $fa3
-; CHECK-NEXT: vextrins.d $vr3, $vr2, 16
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1
-; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1
-; CHECK-NEXT: fmin.d $fa2, $fa4, $fa2
-; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0
-; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0
-; CHECK-NEXT: vextrins.d $vr0, $vr2, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: xvfmin.d $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -81,40 +37,10 @@ entry:
define void @maxnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind {
; CHECK-LABEL: maxnum_v8f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a2, 0
-; CHECK-NEXT: xvld $xr1, $a1, 0
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5
-; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5
-; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2
-; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4
-; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4
-; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3
-; CHECK-NEXT: vextrins.w $vr3, $vr2, 16
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6
-; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6
-; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2
-; CHECK-NEXT: vextrins.w $vr3, $vr2, 32
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7
-; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7
-; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2
-; CHECK-NEXT: vextrins.w $vr3, $vr2, 48
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1
-; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1
-; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2
-; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0
-; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0
-; CHECK-NEXT: fmax.s $fa4, $fa5, $fa4
-; CHECK-NEXT: vextrins.w $vr4, $vr2, 16
-; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2
-; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2
-; CHECK-NEXT: fmax.s $fa2, $fa5, $fa2
-; CHECK-NEXT: vextrins.w $vr4, $vr2, 32
-; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
-; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3
-; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0
-; CHECK-NEXT: vextrins.w $vr4, $vr0, 48
-; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2
-; CHECK-NEXT: xvst $xr4, $a0, 0
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: xvfmax.s $xr0, $xr0, $xr1
+; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <8 x float>, ptr %x
@@ -127,23 +53,9 @@ entry:
define void @maxnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind {
; CHECK-LABEL: maxnum_v4f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xvld $xr0, $a2, 0
-; CHECK-NEXT: xvld $xr1, $a1, 0
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3
-; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3
-; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2
-; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2
-; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2
-; CHECK-NEXT: fmax.d $fa3, $fa4, $fa3
-; CHECK-NEXT: vextrins.d $vr3, $vr2, 16
-; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1
-; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1
-; CHECK-NEXT: fmax.d $fa2, $fa4, $fa2
-; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
-; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0
-; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0
-; CHECK-NEXT: vextrins.d $vr0, $vr2, 16
-; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: xvfmax.d $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll
index 27ecb75..c173092 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll
@@ -5,24 +5,10 @@
define void @minnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind {
; CHECK-LABEL: minnum_v4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a2, 0
-; CHECK-NEXT: vld $vr1, $a1, 0
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1
-; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1
-; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2
-; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0
-; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0
-; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3
-; CHECK-NEXT: vextrins.w $vr3, $vr2, 16
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2
-; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2
-; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2
-; CHECK-NEXT: vextrins.w $vr3, $vr2, 32
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3
-; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0
-; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT: vst $vr3, $a0, 0
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: vfmin.s $vr0, $vr0, $vr1
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <4 x float>, ptr %x
@@ -35,15 +21,9 @@ entry:
define void @minnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind {
; CHECK-LABEL: minnum_v2f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a2, 0
-; CHECK-NEXT: vld $vr1, $a1, 0
-; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1
-; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1
-; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0
-; CHECK-NEXT: vextrins.d $vr0, $vr2, 16
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: vfmin.d $vr0, $vr0, $vr1
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
@@ -57,24 +37,10 @@ entry:
define void @maxnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind {
; CHECK-LABEL: maxnum_v4f32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a2, 0
-; CHECK-NEXT: vld $vr1, $a1, 0
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1
-; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1
-; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2
-; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0
-; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0
-; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3
-; CHECK-NEXT: vextrins.w $vr3, $vr2, 16
-; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2
-; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2
-; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2
-; CHECK-NEXT: vextrins.w $vr3, $vr2, 32
-; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
-; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3
-; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0
-; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
-; CHECK-NEXT: vst $vr3, $a0, 0
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: vfmax.s $vr0, $vr0, $vr1
+; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
%v0 = load <4 x float>, ptr %x
@@ -87,15 +53,9 @@ entry:
define void @maxnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind {
; CHECK-LABEL: maxnum_v2f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vld $vr0, $a2, 0
-; CHECK-NEXT: vld $vr1, $a1, 0
-; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1
-; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1
-; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2
-; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
-; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
-; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0
-; CHECK-NEXT: vextrins.d $vr0, $vr2, 16
+; CHECK-NEXT: vld $vr0, $a1, 0
+; CHECK-NEXT: vld $vr1, $a2, 0
+; CHECK-NEXT: vfmax.d $vr0, $vr0, $vr1
; CHECK-NEXT: vst $vr0, $a0, 0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll
new file mode 100644
index 0000000..d3853e2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll
@@ -0,0 +1,11 @@
+; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s 2>&1 | FileCheck %s
+
+; Test that we get a clear error message when using an unsupported syncscope.
+
+; CHECK: NVPTX backend does not support syncscope "agent"
+; CHECK: Supported syncscopes are: singlethread, <empty string>, block, cluster, device
+define i32 @cmpxchg_unsupported_syncscope_agent(ptr %addr, i32 %cmp, i32 %new) {
+ %result = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("agent") monotonic monotonic
+ %value = extractvalue { i32, i1 } %result, 0
+ ret i32 %value
+}
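+
+; For reference, a sketch (not exercised by the RUN line above, which expects
+; llc to fail): the same cmpxchg with one of the supported syncscopes, e.g.
+; "device", should be accepted. Kept commented out so the error test is
+; unaffected:
+;
+; define i32 @cmpxchg_supported_syncscope_device(ptr %addr, i32 %cmp, i32 %new) {
+;   %result = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("device") monotonic monotonic
+;   %value = extractvalue { i32, i1 } %result, 0
+;   ret i32 %value
+; }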
diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll
new file mode 100644
index 0000000..bf0a2e5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh < %s | FileCheck %s
+
+; CHECK-LABEL: .section .llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 0
+; Num Functions
+; CHECK-NEXT: .word 1
+; Num LargeConstants
+; CHECK-NEXT: .word 0
+; Num Callsites
+; CHECK-NEXT: .word 1
+
+; Functions and stack size
+; CHECK-NEXT: .quad liveArgs
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .quad 1
+
+; Spilled stack map values.
+;
+; Verify 25 stack map entries.
+;
+; CHECK-LABEL: .word .L{{.*}}-liveArgs
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 25
+;
+; Check that at least one is a spilled entry from SP.
+; Location: Indirect SP + ...
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
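+;
+; (Indirect location record layout, per the stackmap format: .byte 3 = Indirect
+; [reg + offset], .byte 0 reserved, .half 8 = 8-byte value, .half 2 = DWARF
+; register 2, i.e. sp/x2 on RISC-V, .half 0 reserved, .word = byte offset.)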
+define void @liveArgs(double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) {
+entry:
+ call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29)
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
index c50a0fb3..320a3aa 100644
--- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
@@ -286,8 +286,8 @@ define void @liveConstant() {
; CHECK-NEXT: .half 0
; CHECK-NEXT: .half 28
;
-; Check that at least one is a spilled entry from RBP.
-; Location: Indirect RBP + ...
+; Check that at least one is a spilled entry from SP.
+; Location: Indirect SP + ...
; CHECK: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .half 8
@@ -307,7 +307,7 @@ entry:
; CHECK-NEXT: .half 0
; 1 location
; CHECK-NEXT: .half 1
-; Loc 0: Direct RBP - ofs
+; Loc 0: Direct SP + ofs
; CHECK-NEXT: .byte 2
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .half 8
@@ -320,14 +320,14 @@ entry:
; CHECK-NEXT: .half 0
; 2 locations
; CHECK-NEXT: .half 2
-; Loc 0: Direct RBP - ofs
+; Loc 0: Direct SP + ofs
; CHECK-NEXT: .byte 2
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .half 8
; CHECK-NEXT: .half 2
; CHECK-NEXT: .half 0
; CHECK-NEXT: .word
-; Loc 1: Direct RBP - ofs
+; Loc 1: Direct SP + ofs
; CHECK-NEXT: .byte 2
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .half 8
diff --git a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll
index 73c46b1..c9b2968 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll
@@ -10,6 +10,7 @@
; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0
; CHECK-DAG: %[[#Half:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#Float:]] = OpTypeFloat 32
; CHECK-DAG: %[[#Struct:]] = OpTypeStruct %[[#Half]]
; CHECK-DAG: %[[#Void:]] = OpTypeVoid
; CHECK-DAG: %[[#PtrInt8:]] = OpTypePointer CrossWorkgroup %[[#Int8:]]
@@ -17,12 +18,20 @@
; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0
; CHECK-DAG: %[[#PtrInt64:]] = OpTypePointer CrossWorkgroup %[[#Int64]]
; CHECK-DAG: %[[#BarType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt64]] %[[#Struct]]
+; CHECK-DAG: %[[#BazType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt8]] %[[#Struct]] %[[#Int8]] %[[#Struct]] %[[#Float]] %[[#Struct]]
; CHECK: OpFunction %[[#Void]] None %[[#FooType]]
; CHECK: OpFunctionParameter %[[#PtrInt8]]
; CHECK: OpFunctionParameter %[[#Struct]]
; CHECK: OpFunction %[[#Void]] None %[[#BarType]]
; CHECK: OpFunctionParameter %[[#PtrInt64]]
; CHECK: OpFunctionParameter %[[#Struct]]
+; CHECK: OpFunction %[[#Void]] None %[[#BazType]]
+; CHECK: OpFunctionParameter %[[#PtrInt8]]
+; CHECK: OpFunctionParameter %[[#Struct]]
+; CHECK: OpFunctionParameter %[[#Int8]]
+; CHECK: OpFunctionParameter %[[#Struct]]
+; CHECK: OpFunctionParameter %[[#Float]]
+; CHECK: OpFunctionParameter %[[#Struct]]
%t_half = type { half }
@@ -38,4 +47,9 @@ entry:
ret void
}
+define spir_kernel void @baz(ptr addrspace(1) %a, %t_half %b, i8 %c, %t_half %d, float %e, %t_half %f) {
+entry:
+ ret void
+}
+
declare spir_func %t_half @_Z29__spirv_SpecConstantComposite(half)
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 8007d9d..c311ab8 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -203,24 +203,14 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB5_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: .LBB5_2:
-; X86-NEXT: andl 4(%eax), %esi
-; X86-NEXT: andl (%eax), %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: setne %al
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $32, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: test_ne_i64:
@@ -242,38 +232,20 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB6_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB6_2:
-; X86-NEXT: movl (%edx), %ecx
-; X86-NEXT: movl 4(%edx), %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: andl %esi, %ebx
-; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: andl %eax, %ebp
-; X86-NEXT: xorl %esi, %edi
-; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: orl %ebx, %ebp
-; X86-NEXT: setne %al
-; X86-NEXT: movl %ecx, (%edx)
-; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btcl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i64:
@@ -300,40 +272,20 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %esi
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB7_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: .LBB7_2:
-; X86-NEXT: movl (%edx), %eax
-; X86-NEXT: movl 4(%edx), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: notl %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: andl %esi, %ebp
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %ecx, %edi
-; X86-NEXT: andl %eax, %esi
-; X86-NEXT: orl %ebx, %ebp
-; X86-NEXT: sete %al
-; X86-NEXT: movl %esi, (%edx)
-; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i64:
@@ -361,38 +313,20 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB8_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB8_2:
-; X86-NEXT: movl (%edx), %ecx
-; X86-NEXT: movl 4(%edx), %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: andl %esi, %ebx
-; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: andl %eax, %ebp
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: orl %ebx, %ebp
-; X86-NEXT: setne %al
-; X86-NEXT: movl %ecx, (%edx)
-; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btsl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i64:
@@ -419,52 +353,47 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: shll %cl, %esi
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: shll %cl, %eax
; X86-NEXT: testb $32, %cl
; X86-NEXT: je .LBB9_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl $0, %edx
; X86-NEXT: .LBB9_2:
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: notl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: notl %esi
+; X86-NEXT: notl %edx
; X86-NEXT: je .LBB9_4
; X86-NEXT: # %bb.3:
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB9_4:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 4(%ecx), %ecx
-; X86-NEXT: andl %ecx, %edx
-; X86-NEXT: andl %ecx, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl (%edi), %ecx
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: andl %ecx, %ebp
-; X86-NEXT: orl %esi, %ebp
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %ebp, (%edi)
-; X86-NEXT: movl %ebx, 4(%edi)
-; X86-NEXT: sete %al
+; X86-NEXT: andl 4(%ebx), %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: andl (%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $32, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: movl (%ebx,%eax), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setae %al
+; X86-NEXT: movl %edx, (%ebx)
+; X86-NEXT: movl %esi, 4(%ebx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i64:
@@ -516,101 +445,25 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $48, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, (%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %esi
-; X86-NEXT: movl 24(%esp,%esi), %edi
-; X86-NEXT: movl 28(%esp,%esi), %eax
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl 16(%esp,%esi), %edx
-; X86-NEXT: movl 20(%esp,%esi), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: andl 8(%ebx), %edi
-; X86-NEXT: andl (%ebx), %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: andl 12(%ebx), %eax
-; X86-NEXT: andl 4(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $96, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
-; SSE-LABEL: test_ne_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: cmovneq %rsi, %rax
-; SSE-NEXT: andq 8(%rdi), %rdx
-; SSE-NEXT: andq (%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: setne %al
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: test_ne_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movl $1, %edx
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %rdx, %rsi
-; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rdx, %rsi
-; AVX2-NEXT: cmovneq %rax, %rdx
-; AVX2-NEXT: andq 8(%rdi), %rsi
-; AVX2-NEXT: andq (%rdi), %rdx
-; AVX2-NEXT: orq %rsi, %rdx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_ne_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: xorl %edx, %edx
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shlxq %rcx, %rax, %rax
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rax, %rdx
-; AVX512-NEXT: cmovneq %rsi, %rax
-; AVX512-NEXT: andq 8(%rdi), %rdx
-; AVX512-NEXT: andq (%rdi), %rax
-; AVX512-NEXT: orq %rdx, %rax
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: retq
+; X64-LABEL: test_ne_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andl $96, %eax
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: movl (%rdi,%rax), %eax
+; X64-NEXT: btl %esi, %eax
+; X64-NEXT: setb %al
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -623,124 +476,33 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 56(%esp,%eax), %esi
-; X86-NEXT: movl 60(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esp,%eax), %edi
-; X86-NEXT: movl 52(%esp,%eax), %ebx
-; X86-NEXT: shldl %cl, %ebx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl 8(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: andl %edi, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 12(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %esi, 12(%eax)
-; X86-NEXT: movl %edi, (%eax)
-; X86-NEXT: movl %ebx, 4(%eax)
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btcl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: complement_ne_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %edx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rdx, %rsi
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r8
-; SSE-NEXT: andq %rsi, %r8
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: andq %rdx, %r9
-; SSE-NEXT: xorq %rcx, %rsi
-; SSE-NEXT: xorq %rax, %rdx
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: setne %al
-; SSE-NEXT: movq %rdx, (%rdi)
-; SSE-NEXT: movq %rsi, 8(%rdi)
-; SSE-NEXT: retq
-;
-; AVX-LABEL: complement_ne_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movl $1, %edx
-; AVX-NEXT: xorl %esi, %esi
-; AVX-NEXT: shldq %cl, %rdx, %rsi
-; AVX-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX-NEXT: testb $64, %cl
-; AVX-NEXT: cmovneq %rdx, %rsi
-; AVX-NEXT: cmovneq %rax, %rdx
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq 8(%rdi), %rcx
-; AVX-NEXT: movq %rcx, %r8
-; AVX-NEXT: andq %rsi, %r8
-; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: andq %rdx, %r9
-; AVX-NEXT: xorq %rcx, %rsi
-; AVX-NEXT: xorq %rax, %rdx
-; AVX-NEXT: orq %r8, %r9
-; AVX-NEXT: setne %al
-; AVX-NEXT: movq %rdx, (%rdi)
-; AVX-NEXT: movq %rsi, 8(%rdi)
-; AVX-NEXT: retq
+; X64-LABEL: complement_ne_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btcl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -755,124 +517,33 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 56(%esp,%eax), %edx
-; X86-NEXT: movl 60(%esp,%eax), %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esp,%eax), %esi
-; X86-NEXT: movl 52(%esp,%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %edx
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl 8(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl (%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: movl 4(%ebx), %ebx
-; X86-NEXT: andl %ebx, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: notl %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl %ebx, %ecx
-; X86-NEXT: notl %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl 8(%ebp), %edi
-; X86-NEXT: movl %edx, 8(%edi)
-; X86-NEXT: movl %eax, 12(%edi)
-; X86-NEXT: movl %esi, (%edi)
-; X86-NEXT: movl %ecx, 4(%edi)
-; X86-NEXT: sete %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: reset_eq_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %edx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rdx, %rsi
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r8
-; SSE-NEXT: andq %rsi, %r8
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: andq %rdx, %r9
-; SSE-NEXT: notq %rdx
-; SSE-NEXT: andq %rcx, %rsi
-; SSE-NEXT: andq %rax, %rdx
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: sete %al
-; SSE-NEXT: movq %rdx, (%rdi)
-; SSE-NEXT: movq %rsi, 8(%rdi)
-; SSE-NEXT: retq
-;
-; AVX-LABEL: reset_eq_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movl $1, %edx
-; AVX-NEXT: xorl %esi, %esi
-; AVX-NEXT: shldq %cl, %rdx, %rsi
-; AVX-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX-NEXT: testb $64, %cl
-; AVX-NEXT: cmovneq %rdx, %rsi
-; AVX-NEXT: cmovneq %rax, %rdx
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq 8(%rdi), %rcx
-; AVX-NEXT: andnq %rcx, %rsi, %r8
-; AVX-NEXT: andq %rsi, %rcx
-; AVX-NEXT: andnq %rax, %rdx, %rsi
-; AVX-NEXT: andq %rdx, %rax
-; AVX-NEXT: orq %rcx, %rax
-; AVX-NEXT: sete %al
-; AVX-NEXT: movq %rsi, (%rdi)
-; AVX-NEXT: movq %r8, 8(%rdi)
-; AVX-NEXT: retq
+; X64-LABEL: reset_eq_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setae %al
+; X64-NEXT: btrl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -888,124 +559,33 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 56(%esp,%eax), %esi
-; X86-NEXT: movl 60(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esp,%eax), %edi
-; X86-NEXT: movl 52(%esp,%eax), %ebx
-; X86-NEXT: shldl %cl, %ebx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl 8(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: andl %edi, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 12(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %esi, 12(%eax)
-; X86-NEXT: movl %edi, (%eax)
-; X86-NEXT: movl %ebx, 4(%eax)
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btsl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: set_ne_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %edx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rdx, %rsi
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r8
-; SSE-NEXT: andq %rsi, %r8
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: andq %rdx, %r9
-; SSE-NEXT: orq %rcx, %rsi
-; SSE-NEXT: orq %rax, %rdx
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: setne %al
-; SSE-NEXT: movq %rdx, (%rdi)
-; SSE-NEXT: movq %rsi, 8(%rdi)
-; SSE-NEXT: retq
-;
-; AVX-LABEL: set_ne_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movl $1, %edx
-; AVX-NEXT: xorl %esi, %esi
-; AVX-NEXT: shldq %cl, %rdx, %rsi
-; AVX-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX-NEXT: testb $64, %cl
-; AVX-NEXT: cmovneq %rdx, %rsi
-; AVX-NEXT: cmovneq %rax, %rdx
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq 8(%rdi), %rcx
-; AVX-NEXT: movq %rcx, %r8
-; AVX-NEXT: andq %rsi, %r8
-; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: andq %rdx, %r9
-; AVX-NEXT: orq %rcx, %rsi
-; AVX-NEXT: orq %rax, %rdx
-; AVX-NEXT: orq %r8, %r9
-; AVX-NEXT: setne %al
-; AVX-NEXT: movq %rdx, (%rdi)
-; AVX-NEXT: movq %rsi, 8(%rdi)
-; AVX-NEXT: retq
+; X64-LABEL: set_ne_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btsl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -1026,9 +606,9 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movzbl 16(%ebp), %eax
+; X86-NEXT: subl $96, %esp
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movzbl 16(%ebp), %ebx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1037,25 +617,30 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shrb $3, %dl
-; X86-NEXT: andb $12, %dl
-; X86-NEXT: negb %dl
-; X86-NEXT: movsbl %dl, %esi
-; X86-NEXT: movl 64(%esp,%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 68(%esp,%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%esp,%esi), %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 64(%esp,%eax), %edx
+; X86-NEXT: movl 68(%esp,%eax), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 76(%esp,%esi), %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movl 72(%esp,%esi), %ebx
+; X86-NEXT: movl 76(%esp,%esi), %esi
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: shldl %cl, %ebx, %esi
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: notl %edi
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1063,72 +648,59 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: movl 36(%esp,%ecx), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: movl 40(%esp,%ecx), %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: andl 8(%eax), %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: notl %esi
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl 44(%esp,%eax), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %edi, %esi
; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl 12(%ecx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl 4(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 100(%esp,%ecx), %edi
-; X86-NEXT: movl 104(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: notl %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl 108(%esp,%ebx), %ebx
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: notl %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl 96(%esp,%ebx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: andl 12(%ecx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl 32(%esp,%eax), %edx
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: andl (%eax), %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 4(%ecx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl 12(%ebp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: andl $96, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: movl (%ecx,%eax), %eax
+; X86-NEXT: btl %esi, %eax
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %edi, 8(%ecx)
-; X86-NEXT: movl %esi, 12(%ecx)
-; X86-NEXT: movl %eax, (%ecx)
-; X86-NEXT: movl %edx, 4(%ecx)
-; X86-NEXT: sete %al
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: setae %al
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -1151,22 +723,20 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; SSE-NEXT: testb $64, %cl
; SSE-NEXT: cmovneq %rsi, %r8
; SSE-NEXT: cmovneq %r9, %rsi
+; SSE-NEXT: notq %r8
; SSE-NEXT: cmovneq %rax, %rdx
; SSE-NEXT: cmovneq %r9, %rax
-; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq 8(%rdi), %r9
-; SSE-NEXT: movq %r9, %r10
-; SSE-NEXT: andq %r8, %r10
-; SSE-NEXT: notq %r8
-; SSE-NEXT: movq %rcx, %r11
-; SSE-NEXT: andq %rsi, %r11
; SSE-NEXT: notq %rsi
-; SSE-NEXT: andq %r9, %r8
+; SSE-NEXT: andq 8(%rdi), %r8
; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq %rcx, %rsi
+; SSE-NEXT: andq (%rdi), %rsi
; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: orq %r10, %r11
-; SSE-NEXT: sete %al
+; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: andl $96, %eax
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: movl (%rdi,%rax), %eax
+; SSE-NEXT: btl %ecx, %eax
+; SSE-NEXT: setae %al
; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: retq
@@ -1174,63 +744,63 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; AVX2-LABEL: init_eq_i128:
; AVX2: # %bb.0:
; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl $1, %esi
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: shldq %cl, %rsi, %rax
-; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: movl $1, %eax
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: shldq %cl, %rax, %rsi
; AVX2-NEXT: movl %edx, %edx
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: shldq %cl, %rdx, %r8
; AVX2-NEXT: xorl %r9d, %r9d
-; AVX2-NEXT: shldq %cl, %rdx, %r9
-; AVX2-NEXT: shlxq %rcx, %rsi, %rsi
+; AVX2-NEXT: shlxq %rcx, %rax, %rax
; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rsi, %rax
-; AVX2-NEXT: cmovneq %r8, %rsi
-; AVX2-NEXT: shlxq %rcx, %rdx, %rcx
-; AVX2-NEXT: cmovneq %rcx, %r9
-; AVX2-NEXT: cmovneq %r8, %rcx
-; AVX2-NEXT: movq (%rdi), %rdx
-; AVX2-NEXT: movq 8(%rdi), %r8
-; AVX2-NEXT: andnq %r8, %rax, %r10
-; AVX2-NEXT: andq %rax, %r8
-; AVX2-NEXT: andnq %rdx, %rsi, %r11
-; AVX2-NEXT: andq %rsi, %rdx
-; AVX2-NEXT: orq %r9, %r10
-; AVX2-NEXT: orq %rcx, %r11
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: movq %r11, (%rdi)
-; AVX2-NEXT: movq %r10, 8(%rdi)
+; AVX2-NEXT: cmovneq %rax, %rsi
+; AVX2-NEXT: cmovneq %r9, %rax
+; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX2-NEXT: cmovneq %rdx, %r8
+; AVX2-NEXT: cmovneq %r9, %rdx
+; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi
+; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: andnq (%rdi), %rax, %r8
+; AVX2-NEXT: orq %rdx, %r8
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $96, %eax
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: movl (%rdi,%rax), %eax
+; AVX2-NEXT: btl %ecx, %eax
+; AVX2-NEXT: setae %al
+; AVX2-NEXT: movq %r8, (%rdi)
+; AVX2-NEXT: movq %rsi, 8(%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: init_eq_i128:
; AVX512: # %bb.0:
; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movl $1, %esi
+; AVX512-NEXT: movl $1, %eax
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: shldq %cl, %rax, %rsi
; AVX512-NEXT: xorl %r8d, %r8d
-; AVX512-NEXT: shldq %cl, %rsi, %r8
-; AVX512-NEXT: shlxq %rcx, %rsi, %rsi
+; AVX512-NEXT: shlxq %rcx, %rax, %rax
; AVX512-NEXT: movl %edx, %edx
; AVX512-NEXT: xorl %r9d, %r9d
; AVX512-NEXT: shldq %cl, %rdx, %r9
; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rsi, %r8
; AVX512-NEXT: cmovneq %rax, %rsi
-; AVX512-NEXT: shlxq %rcx, %rdx, %rcx
-; AVX512-NEXT: cmovneq %rcx, %r9
-; AVX512-NEXT: cmovneq %rax, %rcx
-; AVX512-NEXT: movq (%rdi), %rax
-; AVX512-NEXT: movq 8(%rdi), %rdx
-; AVX512-NEXT: andnq %rdx, %r8, %r10
-; AVX512-NEXT: andq %r8, %rdx
-; AVX512-NEXT: andnq %rax, %rsi, %r8
-; AVX512-NEXT: andq %rsi, %rax
-; AVX512-NEXT: orq %r9, %r10
-; AVX512-NEXT: orq %rcx, %r8
-; AVX512-NEXT: orq %rdx, %rax
-; AVX512-NEXT: sete %al
+; AVX512-NEXT: cmovneq %r8, %rax
+; AVX512-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX512-NEXT: cmovneq %rdx, %r9
+; AVX512-NEXT: cmovneq %r8, %rdx
+; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi
+; AVX512-NEXT: orq %r9, %rsi
+; AVX512-NEXT: andnq (%rdi), %rax, %r8
+; AVX512-NEXT: orq %rdx, %r8
+; AVX512-NEXT: movl %ecx, %eax
+; AVX512-NEXT: andl $96, %eax
+; AVX512-NEXT: shrl $3, %eax
+; AVX512-NEXT: movl (%rdi,%rax), %eax
+; AVX512-NEXT: btl %ecx, %eax
+; AVX512-NEXT: setae %al
; AVX512-NEXT: movq %r8, (%rdi)
-; AVX512-NEXT: movq %r10, 8(%rdi)
+; AVX512-NEXT: movq %rsi, 8(%rdi)
; AVX512-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
@@ -1252,344 +822,25 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $224, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edx), %eax
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edx), %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 52(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 4(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl 40(%ebx), %eax
-; X86-NEXT: andl 8(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 56(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 24(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: andl 44(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 12(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 60(%edi), %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 28(%edi), %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%edx), %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: negl %edx
-; X86-NEXT: movl 192(%esp,%edx), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl 32(%ebx), %ecx
-; X86-NEXT: andl (%ebx), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: andl 16(%ebx), %edi
-; X86-NEXT: andl 48(%ebx), %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 36(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 4(%ebx), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 20(%ebx), %ecx
-; X86-NEXT: andl 52(%ebx), %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $60, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
-; SSE-LABEL: test_ne_i512:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rbx
-; SSE-NEXT: movq -48(%rsp,%rbx), %rdx
-; SSE-NEXT: movq -40(%rsp,%rbx), %r14
-; SSE-NEXT: movq %r14, %rax
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq -16(%rsp,%rbx), %r11
-; SSE-NEXT: movq -8(%rsp,%rbx), %r10
-; SSE-NEXT: shldq %cl, %r11, %r10
-; SSE-NEXT: movq -32(%rsp,%rbx), %r9
-; SSE-NEXT: movq -24(%rsp,%rbx), %r15
-; SSE-NEXT: movq %r15, %r8
-; SSE-NEXT: shldq %cl, %r9, %r8
-; SSE-NEXT: movq -56(%rsp,%rbx), %rsi
-; SSE-NEXT: shldq %cl, %rsi, %rdx
-; SSE-NEXT: shldq %cl, %r15, %r11
-; SSE-NEXT: shldq %cl, %r14, %r9
-; SSE-NEXT: movq -64(%rsp,%rbx), %rbx
-; SSE-NEXT: shldq %cl, %rbx, %rsi
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rbx
-; SSE-NEXT: andq 32(%rdi), %r9
-; SSE-NEXT: andq 48(%rdi), %r11
-; SSE-NEXT: andq 16(%rdi), %rdx
-; SSE-NEXT: orq %r11, %rdx
-; SSE-NEXT: andq 40(%rdi), %r8
-; SSE-NEXT: andq 56(%rdi), %r10
-; SSE-NEXT: andq 24(%rdi), %rax
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: andq (%rdi), %rbx
-; SSE-NEXT: orq %r9, %rbx
-; SSE-NEXT: orq %rdx, %rbx
-; SSE-NEXT: andq 8(%rdi), %rsi
-; SSE-NEXT: orq %r8, %rsi
-; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: orq %rbx, %rsi
-; SSE-NEXT: setne %al
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: test_ne_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rsi
-; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx
-; AVX2-NEXT: movq %rbx, %rax
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq -16(%rsp,%rsi), %r11
-; AVX2-NEXT: movq -8(%rsp,%rsi), %r10
-; AVX2-NEXT: shldq %cl, %r11, %r10
-; AVX2-NEXT: movq -32(%rsp,%rsi), %r9
-; AVX2-NEXT: movq -24(%rsp,%rsi), %r14
-; AVX2-NEXT: movq %r14, %r8
-; AVX2-NEXT: shldq %cl, %r9, %r8
-; AVX2-NEXT: movq -64(%rsp,%rsi), %r15
-; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi
-; AVX2-NEXT: shldq %cl, %rsi, %rdx
-; AVX2-NEXT: shldq %cl, %r14, %r11
-; AVX2-NEXT: shldq %cl, %rbx, %r9
-; AVX2-NEXT: shldq %cl, %r15, %rsi
-; AVX2-NEXT: shlxq %rcx, %r15, %rcx
-; AVX2-NEXT: andq 32(%rdi), %r9
-; AVX2-NEXT: andq 48(%rdi), %r11
-; AVX2-NEXT: andq 16(%rdi), %rdx
-; AVX2-NEXT: andq 40(%rdi), %r8
-; AVX2-NEXT: andq 56(%rdi), %r10
-; AVX2-NEXT: andq 24(%rdi), %rax
-; AVX2-NEXT: orq %r11, %rdx
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: andq (%rdi), %rcx
-; AVX2-NEXT: orq %r9, %rcx
-; AVX2-NEXT: orq %rdx, %rcx
-; AVX2-NEXT: andq 8(%rdi), %rsi
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: orq %rax, %rsi
-; AVX2-NEXT: orq %rcx, %rsi
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_ne_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx
-; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %rax
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq -16(%rsp,%rbx), %r11
-; AVX512-NEXT: movq -8(%rsp,%rbx), %r10
-; AVX512-NEXT: shldq %cl, %r11, %r10
-; AVX512-NEXT: movq -32(%rsp,%rbx), %r9
-; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
-; AVX512-NEXT: movq %r15, %r8
-; AVX512-NEXT: shldq %cl, %r9, %r8
-; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi
-; AVX512-NEXT: shldq %cl, %rsi, %rdx
-; AVX512-NEXT: shldq %cl, %r15, %r11
-; AVX512-NEXT: shldq %cl, %r14, %r9
-; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
-; AVX512-NEXT: shldq %cl, %rbx, %rsi
-; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
-; AVX512-NEXT: andq 32(%rdi), %r9
-; AVX512-NEXT: andq 48(%rdi), %r11
-; AVX512-NEXT: andq 16(%rdi), %rdx
-; AVX512-NEXT: andq 40(%rdi), %r8
-; AVX512-NEXT: andq 56(%rdi), %r10
-; AVX512-NEXT: andq 24(%rdi), %rax
-; AVX512-NEXT: orq %r11, %rdx
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: andq (%rdi), %rcx
-; AVX512-NEXT: orq %r9, %rcx
-; AVX512-NEXT: orq %rdx, %rcx
-; AVX512-NEXT: andq 8(%rdi), %rsi
-; AVX512-NEXT: orq %r8, %rsi
-; AVX512-NEXT: orq %rax, %rsi
-; AVX512-NEXT: orq %rcx, %rsi
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: test_ne_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: andl $60, %eax
+; X64-NEXT: movl (%rdi,%rax), %eax
+; X64-NEXT: btl %esi, %eax
+; X64-NEXT: setb %al
+; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -1602,572 +853,33 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $272, %esp # imm = 0x110
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edx), %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edx), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 52(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl 8(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl 56(%edx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: movl 24(%edx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%eax), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 12(%eax), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl 60(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 28(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: negl %eax
-; X86-NEXT: movl 240(%esp,%eax), %esi
-; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl 32(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %eax
-; X86-NEXT: movl (%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 16(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: movl 48(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 36(%esi), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl %esi, %edi
-; X86-NEXT: movl 52(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl %ebx, 60(%edx)
-; X86-NEXT: movl %edi, 56(%edx)
-; X86-NEXT: movl %ecx, 52(%edx)
-; X86-NEXT: movl %esi, 44(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 40(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 36(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 32(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 28(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 24(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 20(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 16(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 12(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 8(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 4(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, (%edx)
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 48(%edx)
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btcl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: complement_ne_i512:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $56, %rsp
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rbx
-; SSE-NEXT: movq (%rsp,%rbx), %rsi
-; SSE-NEXT: movq 8(%rsp,%rbx), %r14
-; SSE-NEXT: movq %r14, %rax
-; SSE-NEXT: shldq %cl, %rsi, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 32(%rsp,%rbx), %r8
-; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
-; SSE-NEXT: shldq %cl, %r8, %rbp
-; SSE-NEXT: movq 16(%rsp,%rbx), %r9
-; SSE-NEXT: movq 24(%rsp,%rbx), %r15
-; SSE-NEXT: movq %r15, %r10
-; SSE-NEXT: shldq %cl, %r9, %r10
-; SSE-NEXT: movq -8(%rsp,%rbx), %r11
-; SSE-NEXT: shldq %cl, %r11, %rsi
-; SSE-NEXT: shldq %cl, %r15, %r8
-; SSE-NEXT: shldq %cl, %r14, %r9
-; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
-; SSE-NEXT: shldq %cl, %rbx, %r11
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rbx
-; SSE-NEXT: movq 24(%rdi), %r15
-; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 56(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 16(%rdi), %r12
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %r8, %r13
-; SSE-NEXT: andq %rsi, %r12
-; SSE-NEXT: orq %r13, %r12
-; SSE-NEXT: movq %rcx, %r13
-; SSE-NEXT: andq %rbp, %r13
-; SSE-NEXT: andq %rax, %r15
-; SSE-NEXT: orq %r13, %r15
-; SSE-NEXT: movq 32(%rdi), %r14
-; SSE-NEXT: movq %r14, %rcx
-; SSE-NEXT: andq %r9, %rcx
-; SSE-NEXT: movq (%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rbx, %r13
-; SSE-NEXT: orq %rcx, %r13
-; SSE-NEXT: orq %r12, %r13
-; SSE-NEXT: movq 40(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r12
-; SSE-NEXT: andq %r10, %r12
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movq %rdx, %rax
-; SSE-NEXT: andq %r11, %rax
-; SSE-NEXT: orq %r12, %rax
-; SSE-NEXT: orq %r15, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; SSE-NEXT: xorq %rcx, %r10
-; SSE-NEXT: xorq %r14, %r9
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; SSE-NEXT: xorq %rdx, %r11
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: movq %r8, 48(%rdi)
-; SSE-NEXT: movq %rbp, 56(%rdi)
-; SSE-NEXT: movq %r9, 32(%rdi)
-; SSE-NEXT: movq %r10, 40(%rdi)
-; SSE-NEXT: movq %rsi, 16(%rdi)
-; SSE-NEXT: movq %r15, 24(%rdi)
-; SSE-NEXT: movq %rbx, (%rdi)
-; SSE-NEXT: movq %r11, 8(%rdi)
-; SSE-NEXT: setne %al
-; SSE-NEXT: addq $56, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: complement_ne_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $72, %rsp
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, (%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rbx
-; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX2-NEXT: movq %rbp, %rax
-; AVX2-NEXT: shldq %cl, %rsi, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX2-NEXT: shldq %cl, %r8, %r13
-; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX2-NEXT: movq %r14, %r10
-; AVX2-NEXT: shldq %cl, %r9, %r10
-; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX2-NEXT: shldq %cl, %r11, %rsi
-; AVX2-NEXT: shldq %cl, %r14, %r8
-; AVX2-NEXT: movq 16(%rdi), %r12
-; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r8, %r14
-; AVX2-NEXT: andq %rsi, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq 56(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r13, %r15
-; AVX2-NEXT: movq 24(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %rax, %r14
-; AVX2-NEXT: orq %r15, %r14
-; AVX2-NEXT: shldq %cl, %rbp, %r9
-; AVX2-NEXT: movq (%rsp,%rbx), %rdx
-; AVX2-NEXT: movq 32(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r9, %r15
-; AVX2-NEXT: shlxq %rcx, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq (%rdi), %rbx
-; AVX2-NEXT: movq %rbx, %rbp
-; AVX2-NEXT: andq %rax, %rbp
-; AVX2-NEXT: orq %r15, %rbp
-; AVX2-NEXT: orq %r12, %rbp
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %rdx, %r11
-; AVX2-NEXT: movq 40(%rdi), %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: andq %r10, %rcx
-; AVX2-NEXT: movq 8(%rdi), %r15
-; AVX2-NEXT: movq %r15, %r12
-; AVX2-NEXT: andq %r11, %r12
-; AVX2-NEXT: orq %rcx, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX2-NEXT: xorq %rax, %r10
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT: xorq %r15, %r11
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX2-NEXT: orq %rbp, %r12
-; AVX2-NEXT: movq %r8, 48(%rdi)
-; AVX2-NEXT: movq %r13, 56(%rdi)
-; AVX2-NEXT: movq %r9, 32(%rdi)
-; AVX2-NEXT: movq %r10, 40(%rdi)
-; AVX2-NEXT: movq %rsi, 16(%rdi)
-; AVX2-NEXT: movq %rcx, 24(%rdi)
-; AVX2-NEXT: movq %rbx, (%rdi)
-; AVX2-NEXT: movq %r11, 8(%rdi)
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: addq $72, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: complement_ne_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $72, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, (%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX512-NEXT: movq %rbp, %rax
-; AVX512-NEXT: shldq %cl, %rsi, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX512-NEXT: shldq %cl, %r8, %r13
-; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %r10
-; AVX512-NEXT: shldq %cl, %r9, %r10
-; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX512-NEXT: shldq %cl, %r11, %rsi
-; AVX512-NEXT: shldq %cl, %r14, %r8
-; AVX512-NEXT: movq 16(%rdi), %r12
-; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r8, %r14
-; AVX512-NEXT: andq %rsi, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq 56(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r13, %r15
-; AVX512-NEXT: movq 24(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %rax, %r14
-; AVX512-NEXT: orq %r15, %r14
-; AVX512-NEXT: shldq %cl, %rbp, %r9
-; AVX512-NEXT: movq (%rsp,%rbx), %rdx
-; AVX512-NEXT: movq 32(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r9, %r15
-; AVX512-NEXT: shlxq %rcx, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq (%rdi), %rbx
-; AVX512-NEXT: movq %rbx, %rbp
-; AVX512-NEXT: andq %rax, %rbp
-; AVX512-NEXT: orq %r15, %rbp
-; AVX512-NEXT: orq %r12, %rbp
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rdx, %r11
-; AVX512-NEXT: movq 40(%rdi), %rax
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andq %r10, %rcx
-; AVX512-NEXT: movq 8(%rdi), %r15
-; AVX512-NEXT: movq %r15, %r12
-; AVX512-NEXT: andq %r11, %r12
-; AVX512-NEXT: orq %rcx, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT: xorq %rax, %r10
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX512-NEXT: xorq %r15, %r11
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX512-NEXT: orq %rbp, %r12
-; AVX512-NEXT: movq %r8, 48(%rdi)
-; AVX512-NEXT: movq %r13, 56(%rdi)
-; AVX512-NEXT: movq %r9, 32(%rdi)
-; AVX512-NEXT: movq %r10, 40(%rdi)
-; AVX512-NEXT: movq %rsi, 16(%rdi)
-; AVX512-NEXT: movq %rcx, 24(%rdi)
-; AVX512-NEXT: movq %rbx, (%rdi)
-; AVX512-NEXT: movq %r11, 8(%rdi)
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: addq $72, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: complement_ne_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: andl $60, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btcl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -2182,606 +894,33 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $288, %esp # imm = 0x120
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-NEXT: subl %eax, %edi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 4(%edi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edi), %eax
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: shldl %cl, %edx, %ebx
-; X86-NEXT: movl 12(%edi), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edi), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%edi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edi), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edi), %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl 52(%edi), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl 56(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl 44(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edi), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%edi), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: negl %eax
-; X86-NEXT: movl 256(%esp,%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: movl 32(%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %edx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: andl %edi, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: movl 52(%ebx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: notl %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 60(%eax)
-; X86-NEXT: movl %esi, 56(%eax)
-; X86-NEXT: movl %ecx, 52(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 44(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 40(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 36(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 32(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 28(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 24(%eax)
-; X86-NEXT: movl %ebx, 20(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 16(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 48(%eax)
-; X86-NEXT: sete %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: reset_eq_i512:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $56, %rsp
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rdx
-; SSE-NEXT: movq (%rsp,%rdx), %r9
-; SSE-NEXT: movq 8(%rsp,%rdx), %r8
-; SSE-NEXT: movq %r8, %rsi
-; SSE-NEXT: shldq %cl, %r9, %rsi
-; SSE-NEXT: movq -8(%rsp,%rdx), %rax
-; SSE-NEXT: shldq %cl, %rax, %r9
-; SSE-NEXT: movq 16(%rsp,%rdx), %r14
-; SSE-NEXT: movq 24(%rsp,%rdx), %r10
-; SSE-NEXT: movq %r10, %rbx
-; SSE-NEXT: shldq %cl, %r14, %rbx
-; SSE-NEXT: shldq %cl, %r8, %r14
-; SSE-NEXT: movq 32(%rsp,%rdx), %r13
-; SSE-NEXT: movq 40(%rsp,%rdx), %r12
-; SSE-NEXT: shldq %cl, %r13, %r12
-; SSE-NEXT: shldq %cl, %r10, %r13
-; SSE-NEXT: movq -16(%rsp,%rdx), %rdx
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq %r12, %rbp
-; SSE-NEXT: movq %r9, %r15
-; SSE-NEXT: movq %rsi, %r11
-; SSE-NEXT: movq 16(%rdi), %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %r13
-; SSE-NEXT: andq %r8, %r9
-; SSE-NEXT: orq %r13, %r9
-; SSE-NEXT: movq 56(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %r12
-; SSE-NEXT: movq 24(%rdi), %r10
-; SSE-NEXT: andq %r10, %rsi
-; SSE-NEXT: orq %r12, %rsi
-; SSE-NEXT: movq %r14, %r13
-; SSE-NEXT: movq 32(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %r14
-; SSE-NEXT: movq %rdx, %r12
-; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %rdx
-; SSE-NEXT: orq %r14, %rdx
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: movq %rbx, %r14
-; SSE-NEXT: movq 40(%rdi), %rcx
-; SSE-NEXT: andq %rcx, %rbx
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: movq 8(%rdi), %r8
-; SSE-NEXT: andq %r8, %rax
-; SSE-NEXT: orq %rbx, %rax
-; SSE-NEXT: orq %rsi, %rax
-; SSE-NEXT: notq %r11
-; SSE-NEXT: andq %r10, %r11
-; SSE-NEXT: notq %r15
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: notq %r14
-; SSE-NEXT: andq %rcx, %r14
-; SSE-NEXT: notq %r13
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; SSE-NEXT: notq %rbp
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: notq %rcx
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; SSE-NEXT: notq %r9
-; SSE-NEXT: andq %r8, %r9
-; SSE-NEXT: notq %r12
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rcx, 48(%rdi)
-; SSE-NEXT: movq %rbp, 56(%rdi)
-; SSE-NEXT: movq %r13, 32(%rdi)
-; SSE-NEXT: movq %r14, 40(%rdi)
-; SSE-NEXT: movq %r15, 16(%rdi)
-; SSE-NEXT: movq %r11, 24(%rdi)
-; SSE-NEXT: movq %r12, (%rdi)
-; SSE-NEXT: movq %r9, 8(%rdi)
-; SSE-NEXT: sete %al
-; SSE-NEXT: addq $56, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: reset_eq_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rdx
-; AVX2-NEXT: movq -48(%rsp,%rdx), %r8
-; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx
-; AVX2-NEXT: movq %rbx, %rax
-; AVX2-NEXT: shldq %cl, %r8, %rax
-; AVX2-NEXT: movq -16(%rsp,%rdx), %r10
-; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi
-; AVX2-NEXT: shldq %cl, %r10, %rsi
-; AVX2-NEXT: movq -32(%rsp,%rdx), %r11
-; AVX2-NEXT: movq -24(%rsp,%rdx), %r14
-; AVX2-NEXT: movq %r14, %r9
-; AVX2-NEXT: shldq %cl, %r11, %r9
-; AVX2-NEXT: movq -64(%rsp,%rdx), %r15
-; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx
-; AVX2-NEXT: shldq %cl, %rdx, %r8
-; AVX2-NEXT: shldq %cl, %r14, %r10
-; AVX2-NEXT: shldq %cl, %rbx, %r11
-; AVX2-NEXT: shldq %cl, %r15, %rdx
-; AVX2-NEXT: shlxq %rcx, %r15, %rcx
-; AVX2-NEXT: movq 24(%rdi), %rbx
-; AVX2-NEXT: movq 56(%rdi), %r14
-; AVX2-NEXT: movq 16(%rdi), %r15
-; AVX2-NEXT: movq 48(%rdi), %r13
-; AVX2-NEXT: movq 32(%rdi), %rbp
-; AVX2-NEXT: andnq %rbp, %r11, %r12
-; AVX2-NEXT: andq %r11, %rbp
-; AVX2-NEXT: andnq %r13, %r10, %r11
-; AVX2-NEXT: andq %r10, %r13
-; AVX2-NEXT: andnq %r15, %r8, %r10
-; AVX2-NEXT: andq %r8, %r15
-; AVX2-NEXT: movq 40(%rdi), %r8
-; AVX2-NEXT: orq %r13, %r15
-; AVX2-NEXT: andnq %r8, %r9, %r13
-; AVX2-NEXT: andq %r9, %r8
-; AVX2-NEXT: andnq %r14, %rsi, %r9
-; AVX2-NEXT: andq %rsi, %r14
-; AVX2-NEXT: andnq %rbx, %rax, %rsi
-; AVX2-NEXT: andq %rax, %rbx
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: orq %r14, %rbx
-; AVX2-NEXT: andnq %rax, %rcx, %r14
-; AVX2-NEXT: andq %rcx, %rax
-; AVX2-NEXT: orq %rbp, %rax
-; AVX2-NEXT: movq 8(%rdi), %rcx
-; AVX2-NEXT: orq %r15, %rax
-; AVX2-NEXT: andnq %rcx, %rdx, %r15
-; AVX2-NEXT: andq %rdx, %rcx
-; AVX2-NEXT: orq %r8, %rcx
-; AVX2-NEXT: orq %rbx, %rcx
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: movq %r11, 48(%rdi)
-; AVX2-NEXT: movq %r9, 56(%rdi)
-; AVX2-NEXT: movq %r12, 32(%rdi)
-; AVX2-NEXT: movq %r13, 40(%rdi)
-; AVX2-NEXT: movq %r10, 16(%rdi)
-; AVX2-NEXT: movq %rsi, 24(%rdi)
-; AVX2-NEXT: movq %r14, (%rdi)
-; AVX2-NEXT: movq %r15, 8(%rdi)
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: addq $8, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: reset_eq_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: pushq %rax
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq -48(%rsp,%rbx), %r8
-; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %rax
-; AVX512-NEXT: shldq %cl, %r8, %rax
-; AVX512-NEXT: movq -16(%rsp,%rbx), %r10
-; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi
-; AVX512-NEXT: shldq %cl, %r10, %rsi
-; AVX512-NEXT: movq -32(%rsp,%rbx), %r11
-; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
-; AVX512-NEXT: movq %r15, %r9
-; AVX512-NEXT: shldq %cl, %r11, %r9
-; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx
-; AVX512-NEXT: shldq %cl, %rdx, %r8
-; AVX512-NEXT: shldq %cl, %r15, %r10
-; AVX512-NEXT: shldq %cl, %r14, %r11
-; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
-; AVX512-NEXT: shldq %cl, %rbx, %rdx
-; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
-; AVX512-NEXT: movq 24(%rdi), %rbx
-; AVX512-NEXT: movq 56(%rdi), %r14
-; AVX512-NEXT: movq 16(%rdi), %r15
-; AVX512-NEXT: movq 48(%rdi), %r13
-; AVX512-NEXT: movq 32(%rdi), %rbp
-; AVX512-NEXT: andnq %rbp, %r11, %r12
-; AVX512-NEXT: andq %r11, %rbp
-; AVX512-NEXT: andnq %r13, %r10, %r11
-; AVX512-NEXT: andq %r10, %r13
-; AVX512-NEXT: andnq %r15, %r8, %r10
-; AVX512-NEXT: andq %r8, %r15
-; AVX512-NEXT: movq 40(%rdi), %r8
-; AVX512-NEXT: orq %r13, %r15
-; AVX512-NEXT: andnq %r8, %r9, %r13
-; AVX512-NEXT: andq %r9, %r8
-; AVX512-NEXT: andnq %r14, %rsi, %r9
-; AVX512-NEXT: andq %rsi, %r14
-; AVX512-NEXT: andnq %rbx, %rax, %rsi
-; AVX512-NEXT: andq %rax, %rbx
-; AVX512-NEXT: movq (%rdi), %rax
-; AVX512-NEXT: orq %r14, %rbx
-; AVX512-NEXT: andnq %rax, %rcx, %r14
-; AVX512-NEXT: andq %rcx, %rax
-; AVX512-NEXT: orq %rbp, %rax
-; AVX512-NEXT: movq 8(%rdi), %rcx
-; AVX512-NEXT: orq %r15, %rax
-; AVX512-NEXT: andnq %rcx, %rdx, %r15
-; AVX512-NEXT: andq %rdx, %rcx
-; AVX512-NEXT: orq %r8, %rcx
-; AVX512-NEXT: orq %rbx, %rcx
-; AVX512-NEXT: orq %rax, %rcx
-; AVX512-NEXT: movq %r11, 48(%rdi)
-; AVX512-NEXT: movq %r9, 56(%rdi)
-; AVX512-NEXT: movq %r12, 32(%rdi)
-; AVX512-NEXT: movq %r13, 40(%rdi)
-; AVX512-NEXT: movq %r10, 16(%rdi)
-; AVX512-NEXT: movq %rsi, 24(%rdi)
-; AVX512-NEXT: movq %r14, (%rdi)
-; AVX512-NEXT: movq %r15, 8(%rdi)
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: addq $8, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: reset_eq_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: andl $60, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setae %al
+; X64-NEXT: btrl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -2797,572 +936,33 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $272, %esp # imm = 0x110
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edx), %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edx), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 52(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl 8(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl 56(%edx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: movl 24(%edx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%eax), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 12(%eax), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl 60(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 28(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: negl %eax
-; X86-NEXT: movl 240(%esp,%eax), %esi
-; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl 32(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %eax
-; X86-NEXT: movl (%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 16(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: movl 48(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 36(%esi), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl %esi, %edi
-; X86-NEXT: movl 52(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl %ebx, 60(%edx)
-; X86-NEXT: movl %edi, 56(%edx)
-; X86-NEXT: movl %ecx, 52(%edx)
-; X86-NEXT: movl %esi, 44(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 40(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 36(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 32(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 28(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 24(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 20(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 16(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 12(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 8(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 4(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, (%edx)
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 48(%edx)
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btsl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: set_ne_i512:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $56, %rsp
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rbx
-; SSE-NEXT: movq (%rsp,%rbx), %rsi
-; SSE-NEXT: movq 8(%rsp,%rbx), %r14
-; SSE-NEXT: movq %r14, %rax
-; SSE-NEXT: shldq %cl, %rsi, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 32(%rsp,%rbx), %r8
-; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
-; SSE-NEXT: shldq %cl, %r8, %rbp
-; SSE-NEXT: movq 16(%rsp,%rbx), %r9
-; SSE-NEXT: movq 24(%rsp,%rbx), %r15
-; SSE-NEXT: movq %r15, %r10
-; SSE-NEXT: shldq %cl, %r9, %r10
-; SSE-NEXT: movq -8(%rsp,%rbx), %r11
-; SSE-NEXT: shldq %cl, %r11, %rsi
-; SSE-NEXT: shldq %cl, %r15, %r8
-; SSE-NEXT: shldq %cl, %r14, %r9
-; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
-; SSE-NEXT: shldq %cl, %rbx, %r11
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rbx
-; SSE-NEXT: movq 24(%rdi), %r15
-; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 56(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 16(%rdi), %r12
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %r8, %r13
-; SSE-NEXT: andq %rsi, %r12
-; SSE-NEXT: orq %r13, %r12
-; SSE-NEXT: movq %rcx, %r13
-; SSE-NEXT: andq %rbp, %r13
-; SSE-NEXT: andq %rax, %r15
-; SSE-NEXT: orq %r13, %r15
-; SSE-NEXT: movq 32(%rdi), %r14
-; SSE-NEXT: movq %r14, %rcx
-; SSE-NEXT: andq %r9, %rcx
-; SSE-NEXT: movq (%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rbx, %r13
-; SSE-NEXT: orq %rcx, %r13
-; SSE-NEXT: orq %r12, %r13
-; SSE-NEXT: movq 40(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r12
-; SSE-NEXT: andq %r10, %r12
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movq %rdx, %rax
-; SSE-NEXT: andq %r11, %rax
-; SSE-NEXT: orq %r12, %rax
-; SSE-NEXT: orq %r15, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; SSE-NEXT: orq %rcx, %r10
-; SSE-NEXT: orq %r14, %r9
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: movq %r8, 48(%rdi)
-; SSE-NEXT: movq %rbp, 56(%rdi)
-; SSE-NEXT: movq %r9, 32(%rdi)
-; SSE-NEXT: movq %r10, 40(%rdi)
-; SSE-NEXT: movq %rsi, 16(%rdi)
-; SSE-NEXT: movq %r15, 24(%rdi)
-; SSE-NEXT: movq %rbx, (%rdi)
-; SSE-NEXT: movq %r11, 8(%rdi)
-; SSE-NEXT: setne %al
-; SSE-NEXT: addq $56, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: set_ne_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $72, %rsp
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, (%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rbx
-; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX2-NEXT: movq %rbp, %rax
-; AVX2-NEXT: shldq %cl, %rsi, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX2-NEXT: shldq %cl, %r8, %r13
-; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX2-NEXT: movq %r14, %r10
-; AVX2-NEXT: shldq %cl, %r9, %r10
-; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX2-NEXT: shldq %cl, %r11, %rsi
-; AVX2-NEXT: shldq %cl, %r14, %r8
-; AVX2-NEXT: movq 16(%rdi), %r12
-; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r8, %r14
-; AVX2-NEXT: andq %rsi, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq 56(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r13, %r15
-; AVX2-NEXT: movq 24(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %rax, %r14
-; AVX2-NEXT: orq %r15, %r14
-; AVX2-NEXT: shldq %cl, %rbp, %r9
-; AVX2-NEXT: movq (%rsp,%rbx), %rdx
-; AVX2-NEXT: movq 32(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r9, %r15
-; AVX2-NEXT: shlxq %rcx, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq (%rdi), %rbx
-; AVX2-NEXT: movq %rbx, %rbp
-; AVX2-NEXT: andq %rax, %rbp
-; AVX2-NEXT: orq %r15, %rbp
-; AVX2-NEXT: orq %r12, %rbp
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %rdx, %r11
-; AVX2-NEXT: movq 40(%rdi), %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: andq %r10, %rcx
-; AVX2-NEXT: movq 8(%rdi), %r15
-; AVX2-NEXT: movq %r15, %r12
-; AVX2-NEXT: andq %r11, %r12
-; AVX2-NEXT: orq %rcx, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX2-NEXT: orq %rax, %r10
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT: orq %r15, %r11
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX2-NEXT: orq %rbp, %r12
-; AVX2-NEXT: movq %r8, 48(%rdi)
-; AVX2-NEXT: movq %r13, 56(%rdi)
-; AVX2-NEXT: movq %r9, 32(%rdi)
-; AVX2-NEXT: movq %r10, 40(%rdi)
-; AVX2-NEXT: movq %rsi, 16(%rdi)
-; AVX2-NEXT: movq %rcx, 24(%rdi)
-; AVX2-NEXT: movq %rbx, (%rdi)
-; AVX2-NEXT: movq %r11, 8(%rdi)
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: addq $72, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: set_ne_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $72, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, (%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX512-NEXT: movq %rbp, %rax
-; AVX512-NEXT: shldq %cl, %rsi, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX512-NEXT: shldq %cl, %r8, %r13
-; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %r10
-; AVX512-NEXT: shldq %cl, %r9, %r10
-; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX512-NEXT: shldq %cl, %r11, %rsi
-; AVX512-NEXT: shldq %cl, %r14, %r8
-; AVX512-NEXT: movq 16(%rdi), %r12
-; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r8, %r14
-; AVX512-NEXT: andq %rsi, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq 56(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r13, %r15
-; AVX512-NEXT: movq 24(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %rax, %r14
-; AVX512-NEXT: orq %r15, %r14
-; AVX512-NEXT: shldq %cl, %rbp, %r9
-; AVX512-NEXT: movq (%rsp,%rbx), %rdx
-; AVX512-NEXT: movq 32(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r9, %r15
-; AVX512-NEXT: shlxq %rcx, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq (%rdi), %rbx
-; AVX512-NEXT: movq %rbx, %rbp
-; AVX512-NEXT: andq %rax, %rbp
-; AVX512-NEXT: orq %r15, %rbp
-; AVX512-NEXT: orq %r12, %rbp
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rdx, %r11
-; AVX512-NEXT: movq 40(%rdi), %rax
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andq %r10, %rcx
-; AVX512-NEXT: movq 8(%rdi), %r15
-; AVX512-NEXT: movq %r15, %r12
-; AVX512-NEXT: andq %r11, %r12
-; AVX512-NEXT: orq %rcx, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT: orq %rax, %r10
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX512-NEXT: orq %r15, %r11
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX512-NEXT: orq %rbp, %r12
-; AVX512-NEXT: movq %r8, 48(%rdi)
-; AVX512-NEXT: movq %r13, 56(%rdi)
-; AVX512-NEXT: movq %r9, 32(%rdi)
-; AVX512-NEXT: movq %r10, 40(%rdi)
-; AVX512-NEXT: movq %rsi, 16(%rdi)
-; AVX512-NEXT: movq %rcx, 24(%rdi)
-; AVX512-NEXT: movq %rbx, (%rdi)
-; AVX512-NEXT: movq %r11, 8(%rdi)
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: addq $72, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: set_ne_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: andl $60, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btsl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -3383,13 +983,14 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $432, %esp # imm = 0x1B0
+; X86-NEXT: subl $352, %esp # imm = 0x160
; X86-NEXT: movl 12(%ebp), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: shrl $3, %edx
; X86-NEXT: andl $60, %edx
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: subl %edx, %eax
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -3422,60 +1023,58 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 56(%esi), %eax
+; X86-NEXT: movl 56(%eax), %esi
+; X86-NEXT: movl 60(%eax), %ebx
+; X86-NEXT: movl 52(%eax), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%eax), %edi
+; X86-NEXT: movl 44(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%eax), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%esi), %eax
+; X86-NEXT: movzbl 16(%ebp), %eax
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%esi), %eax
-; X86-NEXT: movl 48(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esi), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%esi), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl 16(%ebp), %ebx
-; X86-NEXT: movzbl %bl, %esi
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3500,9 +1099,12 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -3534,273 +1136,148 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edx
+; X86-NEXT: movl 56(%eax), %esi
+; X86-NEXT: movl 60(%eax), %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: andl 60(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 52(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 56(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 48(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 52(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 44(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 48(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 40(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 44(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 36(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 40(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 32(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 36(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 28(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 32(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 24(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 28(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 20(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 24(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 16(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 20(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 12(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 16(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 8(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 12(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 4(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 8(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: andl 4(%edx), %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: notl %esi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: andl (%edx), %esi
+; X86-NEXT: orl %eax, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%edx,%eax), %eax
+; X86-NEXT: btl %ecx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %eax, 60(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, 56(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %eax, 52(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%ebx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, 48(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl 56(%edi), %ebx
-; X86-NEXT: movl 60(%edi), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 52(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 48(%edi), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, 44(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: notl %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl 40(%edi), %ebx
-; X86-NEXT: movl 44(%edi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 36(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 32(%edi), %ebx
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 28(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 24(%edi), %ebx
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 20(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 16(%edi), %ebx
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 12(%edi), %eax
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: notl %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 8(%edi), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, 40(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: notl %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 4(%edi), %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl %eax, 36(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl (%edi), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %eax, 32(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 60(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 56(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 52(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 44(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 40(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 36(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 32(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 28(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 24(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 20(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 16(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %esi, 48(%eax)
-; X86-NEXT: sete %al
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%edx)
+; X86-NEXT: movl %ebx, 8(%edx)
+; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %esi, (%edx)
+; X86-NEXT: setae %al
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -3816,7 +1293,8 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; SSE-NEXT: pushq %r13
; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $216, %rsp
+; SSE-NEXT: subq $168, %rsp
+; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
@@ -3829,139 +1307,107 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %r10
-; SSE-NEXT: movq 184(%rsp,%r10), %r11
-; SSE-NEXT: movq 192(%rsp,%r10), %rsi
-; SSE-NEXT: movq %rsi, %r13
-; SSE-NEXT: shldq %cl, %r11, %r13
-; SSE-NEXT: movq 200(%rsp,%r10), %r15
-; SSE-NEXT: shldq %cl, %rsi, %r15
-; SSE-NEXT: movq 168(%rsp,%r10), %rbx
-; SSE-NEXT: movq 176(%rsp,%r10), %rsi
-; SSE-NEXT: movq %rsi, %r14
-; SSE-NEXT: shldq %cl, %rbx, %r14
-; SSE-NEXT: shldq %cl, %rsi, %r11
-; SSE-NEXT: movq 152(%rsp,%r10), %rax
-; SSE-NEXT: movq 160(%rsp,%r10), %r8
-; SSE-NEXT: movq %r8, %r12
-; SSE-NEXT: shldq %cl, %rax, %r12
-; SSE-NEXT: shldq %cl, %r8, %rbx
-; SSE-NEXT: movq 144(%rsp,%r10), %r9
-; SSE-NEXT: movq %r9, %r8
-; SSE-NEXT: shlq %cl, %r8
-; SSE-NEXT: shldq %cl, %r9, %rax
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movl %edx, %edx
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %eax
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: andl $56, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: movslq %eax, %r12
+; SSE-NEXT: movq 136(%rsp,%r12), %r9
+; SSE-NEXT: movq 144(%rsp,%r12), %rax
+; SSE-NEXT: movq %rax, %rsi
+; SSE-NEXT: shldq %cl, %r9, %rsi
+; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 152(%rsp,%r12), %r11
+; SSE-NEXT: shldq %cl, %rax, %r11
+; SSE-NEXT: movq 120(%rsp,%r12), %r10
+; SSE-NEXT: movq 128(%rsp,%r12), %rax
+; SSE-NEXT: movq %rax, %rbx
+; SSE-NEXT: shldq %cl, %r10, %rbx
+; SSE-NEXT: shldq %cl, %rax, %r9
+; SSE-NEXT: movq 104(%rsp,%r12), %r14
+; SSE-NEXT: movq 112(%rsp,%r12), %rax
+; SSE-NEXT: movq %rax, %r15
+; SSE-NEXT: shldq %cl, %r14, %r15
+; SSE-NEXT: shldq %cl, %rax, %r10
+; SSE-NEXT: movq 96(%rsp,%r12), %rax
+; SSE-NEXT: movq %rax, %r13
+; SSE-NEXT: shlq %cl, %r13
+; SSE-NEXT: shldq %cl, %rax, %r14
+; SSE-NEXT: movl %edx, %eax
; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq 16(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %rsi
-; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rsi, %r13
-; SSE-NEXT: andq %rdx, %r12
-; SSE-NEXT: orq %r13, %r12
-; SSE-NEXT: movq %r15, %rsi
-; SSE-NEXT: movq 56(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %r15
-; SSE-NEXT: movq %rbx, %r13
-; SSE-NEXT: movq 24(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %rbx
-; SSE-NEXT: orq %r15, %rbx
-; SSE-NEXT: movq %r14, %rbp
-; SSE-NEXT: movq 32(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %r14
-; SSE-NEXT: movq %r8, %r15
-; SSE-NEXT: movq (%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %r8
-; SSE-NEXT: orq %r14, %r8
-; SSE-NEXT: orq %r12, %r8
-; SSE-NEXT: movq %r11, %r12
-; SSE-NEXT: movq 40(%rdi), %r9
-; SSE-NEXT: andq %r9, %r11
-; SSE-NEXT: movq %rax, %r14
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %rax
-; SSE-NEXT: orq %r11, %rax
-; SSE-NEXT: orq %rbx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 8(%rsp,%r12), %r8
+; SSE-NEXT: movq 16(%rsp,%r12), %rsi
+; SSE-NEXT: movq %rsi, %rbp
+; SSE-NEXT: shldq %cl, %r8, %rbp
; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE-NEXT: notq %rax
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq 56(%rsp,%r10), %r11
-; SSE-NEXT: movq 64(%rsp,%r10), %rax
-; SSE-NEXT: movq %rax, %rbx
-; SSE-NEXT: shldq %cl, %r11, %rbx
-; SSE-NEXT: orq %rbx, %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: movq 72(%rsp,%r10), %rbx
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; SSE-NEXT: orq %rbx, %rsi
-; SSE-NEXT: notq %rbp
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: movq 40(%rsp,%r10), %rax
-; SSE-NEXT: movq 48(%rsp,%r10), %rdx
-; SSE-NEXT: movq %rdx, %rbx
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: orq %rbx, %rbp
-; SSE-NEXT: notq %r12
-; SSE-NEXT: andq %r9, %r12
-; SSE-NEXT: shldq %cl, %rdx, %r11
-; SSE-NEXT: movq 24(%rsp,%r10), %r9
-; SSE-NEXT: movq 32(%rsp,%r10), %rdx
-; SSE-NEXT: movq %rdx, %rbx
-; SSE-NEXT: shldq %cl, %r9, %rbx
-; SSE-NEXT: orq %r11, %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 48(%rdi), %rax
+; SSE-NEXT: orq %rbp, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: notq %rbx
; SSE-NEXT: notq %r11
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: orq %rbx, %r11
-; SSE-NEXT: notq %r13
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; SSE-NEXT: orq %rax, %r13
+; SSE-NEXT: movq 24(%rsp,%r12), %rax
+; SSE-NEXT: shldq %cl, %rsi, %rax
+; SSE-NEXT: movq -8(%rsp,%r12), %rbp
+; SSE-NEXT: movq (%rsp,%r12), %rdx
+; SSE-NEXT: movq %rdx, %rsi
+; SSE-NEXT: shldq %cl, %rbp, %rsi
+; SSE-NEXT: andq 56(%rdi), %r11
+; SSE-NEXT: andq 32(%rdi), %rbx
+; SSE-NEXT: orq %rax, %r11
+; SSE-NEXT: orq %rsi, %rbx
; SSE-NEXT: notq %r15
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: movq 16(%rsp,%r10), %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: orq %rdx, %r15
+; SSE-NEXT: shldq %cl, %rdx, %r8
+; SSE-NEXT: notq %r9
+; SSE-NEXT: andq 40(%rdi), %r9
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: movq -24(%rsp,%r12), %rax
+; SSE-NEXT: movq -16(%rsp,%r12), %rdx
+; SSE-NEXT: movq %rdx, %rsi
+; SSE-NEXT: shldq %cl, %rax, %rsi
+; SSE-NEXT: andq 16(%rdi), %r15
+; SSE-NEXT: orq %rsi, %r15
+; SSE-NEXT: shldq %cl, %rdx, %rbp
+; SSE-NEXT: notq %r10
+; SSE-NEXT: notq %r13
+; SSE-NEXT: movq -32(%rsp,%r12), %rdx
+; SSE-NEXT: movq %rdx, %rsi
+; SSE-NEXT: shlq %cl, %rsi
+; SSE-NEXT: andq 24(%rdi), %r10
+; SSE-NEXT: andq (%rdi), %r13
+; SSE-NEXT: orq %rbp, %r10
+; SSE-NEXT: orq %rsi, %r13
; SSE-NEXT: notq %r14
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shldq %cl, %rax, %r9
-; SSE-NEXT: orq %r9, %r14
-; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: andq 8(%rdi), %r14
+; SSE-NEXT: orq %rax, %r14
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andl $60, %eax
+; SSE-NEXT: movl (%rdi,%rax), %eax
+; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; SSE-NEXT: btl %ecx, %eax
; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE-NEXT: movq %rax, 48(%rdi)
-; SSE-NEXT: movq %rsi, 56(%rdi)
-; SSE-NEXT: movq %rbp, 32(%rdi)
-; SSE-NEXT: movq %r12, 40(%rdi)
-; SSE-NEXT: movq %r11, 16(%rdi)
-; SSE-NEXT: movq %r13, 24(%rdi)
-; SSE-NEXT: movq %r15, (%rdi)
+; SSE-NEXT: movq %r11, 56(%rdi)
+; SSE-NEXT: movq %rbx, 32(%rdi)
+; SSE-NEXT: movq %r9, 40(%rdi)
+; SSE-NEXT: movq %r15, 16(%rdi)
+; SSE-NEXT: movq %r10, 24(%rdi)
+; SSE-NEXT: movq %r13, (%rdi)
; SSE-NEXT: movq %r14, 8(%rdi)
-; SSE-NEXT: sete %al
-; SSE-NEXT: addq $216, %rsp
+; SSE-NEXT: setae %al
+; SSE-NEXT: addq $168, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r12
; SSE-NEXT: popq %r13
@@ -3978,132 +1424,105 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $200, %rsp
+; AVX2-NEXT: subq $184, %rsp
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %r8d
-; AVX2-NEXT: andl $63, %r8d
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rsi
-; AVX2-NEXT: movq 144(%rsp,%rsi), %r11
-; AVX2-NEXT: movq 152(%rsp,%rsi), %r12
-; AVX2-NEXT: movq %r12, %r10
-; AVX2-NEXT: movl %r8d, %ecx
-; AVX2-NEXT: shldq %cl, %r11, %r10
-; AVX2-NEXT: movq 176(%rsp,%rsi), %r14
-; AVX2-NEXT: movq 184(%rsp,%rsi), %r9
+; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: movl %esi, %ebx
+; AVX2-NEXT: shrl $3, %ebx
+; AVX2-NEXT: movl %ebx, %eax
+; AVX2-NEXT: andl $56, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: movslq %eax, %r11
+; AVX2-NEXT: movq 128(%rsp,%r11), %r15
+; AVX2-NEXT: movq 136(%rsp,%r11), %rax
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: shldq %cl, %r15, %rsi
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 120(%rsp,%r11), %r8
+; AVX2-NEXT: shldq %cl, %r8, %r15
+; AVX2-NEXT: movq 144(%rsp,%r11), %r14
+; AVX2-NEXT: movq 152(%rsp,%r11), %rsi
+; AVX2-NEXT: movq %rsi, %r9
; AVX2-NEXT: shldq %cl, %r14, %r9
-; AVX2-NEXT: movq 160(%rsp,%rsi), %r15
-; AVX2-NEXT: movq 168(%rsp,%rsi), %r13
-; AVX2-NEXT: movq %r13, %rbx
-; AVX2-NEXT: shldq %cl, %r15, %rbx
-; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp
-; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 136(%rsp,%rsi), %rax
-; AVX2-NEXT: shldq %cl, %rax, %r11
-; AVX2-NEXT: shldq %cl, %r13, %r14
-; AVX2-NEXT: shldq %cl, %r12, %r15
-; AVX2-NEXT: shldq %cl, %rbp, %rax
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rax, %r14
+; AVX2-NEXT: movq 112(%rsp,%r11), %rax
; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl %edx, %edx
+; AVX2-NEXT: movq 160(%rsp,%r11), %r13
+; AVX2-NEXT: movq 168(%rsp,%r11), %r12
+; AVX2-NEXT: shldq %cl, %r13, %r12
+; AVX2-NEXT: shldq %cl, %rsi, %r13
+; AVX2-NEXT: shldq %cl, %rax, %r8
+; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, (%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq 16(%rdi), %r12
-; AVX2-NEXT: movq 48(%rdi), %rbp
-; AVX2-NEXT: movq 32(%rdi), %r13
-; AVX2-NEXT: andnq %r13, %r15, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r15, %r13
-; AVX2-NEXT: andnq %rbp, %r14, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r14, %rbp
-; AVX2-NEXT: andnq %r12, %r11, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r11, %r12
-; AVX2-NEXT: movq 40(%rdi), %rax
-; AVX2-NEXT: orq %rbp, %r12
-; AVX2-NEXT: andnq %rax, %rbx, %rcx
-; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq %rax, %rbp
-; AVX2-NEXT: andq %rbx, %rbp
-; AVX2-NEXT: movq 56(%rdi), %rcx
-; AVX2-NEXT: andnq %rcx, %r9, %rbx
-; AVX2-NEXT: andq %r9, %rcx
-; AVX2-NEXT: movq 24(%rdi), %rax
-; AVX2-NEXT: andnq %rax, %r10, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r10, %rax
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: movq (%rdi), %r10
-; AVX2-NEXT: andnq %r10, %rcx, %r15
-; AVX2-NEXT: andq %rcx, %r10
-; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq 48(%rsp,%rsi), %r11
-; AVX2-NEXT: movq %r11, %r9
-; AVX2-NEXT: movl %r8d, %ecx
-; AVX2-NEXT: shldq %cl, %rdx, %r9
-; AVX2-NEXT: orq %r13, %r10
-; AVX2-NEXT: orq %r12, %r10
-; AVX2-NEXT: movq 8(%rdi), %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: andnq %r13, %rcx, %r12
-; AVX2-NEXT: andq %rcx, %r13
-; AVX2-NEXT: orq %rbp, %r13
+; AVX2-NEXT: movq 24(%rsp,%r11), %rbp
+; AVX2-NEXT: movq 32(%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shldq %cl, %rbp, %rax
+; AVX2-NEXT: movq 40(%rsp,%r11), %r10
+; AVX2-NEXT: shldq %cl, %rdx, %r10
+; AVX2-NEXT: movq 8(%rsp,%r11), %r9
+; AVX2-NEXT: movq 16(%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: shldq %cl, %r9, %r8
+; AVX2-NEXT: shldq %cl, %rdx, %rbp
+; AVX2-NEXT: andnq 48(%rdi), %r13, %r13
; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq 56(%rsp,%rsi), %rax
-; AVX2-NEXT: movl %r8d, %ecx
-; AVX2-NEXT: shldq %cl, %r11, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: orq %r9, %r14
-; AVX2-NEXT: orq %rax, %rbx
-; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 24(%rsp,%rsi), %rax
-; AVX2-NEXT: movq 32(%rsp,%rsi), %r9
-; AVX2-NEXT: movq %r9, %r11
-; AVX2-NEXT: shldq %cl, %rax, %r11
-; AVX2-NEXT: shldq %cl, %r9, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX2-NEXT: orq %r11, %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: orq %rdx, %rbx
-; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq 16(%rsp,%rsi), %r9
-; AVX2-NEXT: movq %r9, %r11
-; AVX2-NEXT: shldq %cl, %rdx, %r11
-; AVX2-NEXT: shldq %cl, %r9, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: orq %r11, %r9
-; AVX2-NEXT: movq (%rsp,%rsi), %rsi
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: shlxq %r8, %rsi, %rax
-; AVX2-NEXT: shldq %cl, %rsi, %rdx
-; AVX2-NEXT: orq %rax, %r15
-; AVX2-NEXT: orq %rdx, %r12
-; AVX2-NEXT: orq %r10, %r13
-; AVX2-NEXT: movq %r14, 48(%rdi)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: movq %rax, 56(%rdi)
-; AVX2-NEXT: movq %rbp, 32(%rdi)
-; AVX2-NEXT: movq %rbx, 40(%rdi)
-; AVX2-NEXT: movq %r9, 16(%rdi)
-; AVX2-NEXT: movq %r11, 24(%rdi)
-; AVX2-NEXT: movq %r15, (%rdi)
-; AVX2-NEXT: movq %r12, 8(%rdi)
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: addq $200, %rsp
+; AVX2-NEXT: movq -8(%rsp,%r11), %rax
+; AVX2-NEXT: movq (%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: shldq %cl, %rax, %rsi
+; AVX2-NEXT: shldq %cl, %rdx, %r9
+; AVX2-NEXT: andnq 56(%rdi), %r12, %r12
+; AVX2-NEXT: andnq 32(%rdi), %r14, %r14
+; AVX2-NEXT: orq %r10, %r12
+; AVX2-NEXT: orq %r8, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: andnq 40(%rdi), %rdx, %rdx
+; AVX2-NEXT: orq %rbp, %rdx
+; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT: movq -16(%rsp,%r11), %r10
+; AVX2-NEXT: shlxq %rcx, %r10, %r11
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %r10, %rax
+; AVX2-NEXT: andnq 16(%rdi), %r15, %rcx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andnq 24(%rdi), %r10, %r10
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: orq %r9, %r10
+; AVX2-NEXT: andnq (%rdi), %r8, %rsi
+; AVX2-NEXT: orq %r11, %rsi
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andnq 8(%rdi), %r8, %r8
+; AVX2-NEXT: orq %rax, %r8
+; AVX2-NEXT: andl $60, %ebx
+; AVX2-NEXT: movl (%rdi,%rbx), %eax
+; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; AVX2-NEXT: btl %r9d, %eax
+; AVX2-NEXT: movq %r13, 48(%rdi)
+; AVX2-NEXT: movq %r12, 56(%rdi)
+; AVX2-NEXT: movq %r14, 32(%rdi)
+; AVX2-NEXT: movq %rdx, 40(%rdi)
+; AVX2-NEXT: movq %rcx, 16(%rdi)
+; AVX2-NEXT: movq %r10, 24(%rdi)
+; AVX2-NEXT: movq %rsi, (%rdi)
+; AVX2-NEXT: movq %r8, 8(%rdi)
+; AVX2-NEXT: setae %al
+; AVX2-NEXT: addq $184, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
@@ -4121,39 +1540,41 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $184, %rsp
+; AVX512-NEXT: subq $168, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512-NEXT: movl %esi, %ecx
; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rsi
-; AVX512-NEXT: movq 128(%rsp,%rsi), %r10
-; AVX512-NEXT: movq 136(%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rax
-; AVX512-NEXT: shldq %cl, %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 160(%rsp,%rsi), %r14
-; AVX512-NEXT: movq 168(%rsp,%rsi), %rax
-; AVX512-NEXT: shldq %cl, %r14, %rax
+; AVX512-NEXT: movl %esi, %r10d
+; AVX512-NEXT: shrl $3, %r10d
+; AVX512-NEXT: movl %r10d, %r8d
+; AVX512-NEXT: andl $56, %r8d
+; AVX512-NEXT: negl %r8d
+; AVX512-NEXT: movslq %r8d, %r9
+; AVX512-NEXT: movq 112(%rsp,%r9), %r11
+; AVX512-NEXT: movq 120(%rsp,%r9), %r14
+; AVX512-NEXT: movq %r14, %rax
+; AVX512-NEXT: shldq %cl, %r11, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 144(%rsp,%rsi), %r15
-; AVX512-NEXT: movq 152(%rsp,%rsi), %r11
-; AVX512-NEXT: movq %r11, %rbx
+; AVX512-NEXT: movq 104(%rsp,%r9), %rax
+; AVX512-NEXT: shldq %cl, %rax, %r11
+; AVX512-NEXT: movq 128(%rsp,%r9), %r15
+; AVX512-NEXT: movq 136(%rsp,%r9), %rbp
+; AVX512-NEXT: movq %rbp, %rbx
; AVX512-NEXT: shldq %cl, %r15, %rbx
-; AVX512-NEXT: movq 120(%rsp,%rsi), %rax
+; AVX512-NEXT: shldq %cl, %r14, %r15
+; AVX512-NEXT: movq 144(%rsp,%r9), %r13
+; AVX512-NEXT: movq 152(%rsp,%r9), %r12
+; AVX512-NEXT: shldq %cl, %r13, %r12
+; AVX512-NEXT: movq 96(%rsp,%r9), %r14
+; AVX512-NEXT: shldq %cl, %rbp, %r13
+; AVX512-NEXT: shldq %cl, %r14, %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rax, %r10
-; AVX512-NEXT: shldq %cl, %r11, %r14
-; AVX512-NEXT: movq %rdi, %r9
-; AVX512-NEXT: movq 112(%rsp,%rsi), %r11
-; AVX512-NEXT: shldq %cl, %r12, %r15
; AVX512-NEXT: movl %edx, %edx
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
@@ -4162,90 +1583,59 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq 16(%rdi), %r12
-; AVX512-NEXT: movq 48(%rdi), %r13
-; AVX512-NEXT: movq 32(%rdi), %rbp
-; AVX512-NEXT: andnq %rbp, %r15, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r15, %rbp
-; AVX512-NEXT: andnq %r13, %r14, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r14, %r13
-; AVX512-NEXT: andnq %r12, %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r10, %r12
-; AVX512-NEXT: movq 40(%rdi), %r8
-; AVX512-NEXT: orq %r13, %r12
-; AVX512-NEXT: andnq %r8, %rbx, %rdi
-; AVX512-NEXT: andq %rbx, %r8
-; AVX512-NEXT: movq 56(%r9), %r13
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT: andnq %r13, %rdx, %r10
-; AVX512-NEXT: andq %rdx, %r13
-; AVX512-NEXT: movq 24(%r9), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT: andnq %rax, %rdx, %r15
-; AVX512-NEXT: andq %rdx, %rax
-; AVX512-NEXT: orq %r13, %rax
-; AVX512-NEXT: shlxq %rcx, %r11, %r13
-; AVX512-NEXT: movq (%r9), %rdx
-; AVX512-NEXT: andnq %rdx, %r13, %r14
-; AVX512-NEXT: andq %r13, %rdx
-; AVX512-NEXT: orq %rbp, %rdx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r11, %rbp
-; AVX512-NEXT: orq %r12, %rdx
-; AVX512-NEXT: movq 8(%r9), %r13
-; AVX512-NEXT: andnq %r13, %rbp, %rbx
-; AVX512-NEXT: andq %rbp, %r13
-; AVX512-NEXT: orq %r8, %r13
-; AVX512-NEXT: movq 24(%rsp,%rsi), %r8
-; AVX512-NEXT: orq %rax, %r13
-; AVX512-NEXT: movq 32(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, %r12
-; AVX512-NEXT: shldq %cl, %r8, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: orq %r12, %r11
-; AVX512-NEXT: movq 40(%rsp,%rsi), %r12
-; AVX512-NEXT: shldq %cl, %rax, %r12
-; AVX512-NEXT: orq %r12, %r10
-; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 8(%rsp,%rsi), %rax
-; AVX512-NEXT: movq 16(%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rbp
-; AVX512-NEXT: shldq %cl, %rax, %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: orq %rbp, %r10
-; AVX512-NEXT: shldq %cl, %r12, %r8
-; AVX512-NEXT: orq %r8, %rdi
-; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq -8(%rsp,%rsi), %r8
-; AVX512-NEXT: movq (%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rbp
+; AVX512-NEXT: movq 8(%rsp,%r9), %r8
+; AVX512-NEXT: movq 16(%rsp,%r9), %rax
+; AVX512-NEXT: movq %rax, %rbp
; AVX512-NEXT: shldq %cl, %r8, %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT: orq %rbp, %rdi
-; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi
-; AVX512-NEXT: shldq %cl, %r12, %rax
-; AVX512-NEXT: orq %rax, %r15
-; AVX512-NEXT: shlxq %rcx, %rsi, %rax
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: andnq 48(%rdi), %r13, %r13
+; AVX512-NEXT: orq %rbp, %r13
+; AVX512-NEXT: movq 24(%rsp,%r9), %rbp
+; AVX512-NEXT: shldq %cl, %rax, %rbp
+; AVX512-NEXT: movq -8(%rsp,%r9), %rax
+; AVX512-NEXT: movq (%rsp,%r9), %rsi
+; AVX512-NEXT: movq %rsi, %rdx
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: andnq 56(%rdi), %r12, %r12
+; AVX512-NEXT: orq %rbp, %r12
+; AVX512-NEXT: andnq 32(%rdi), %r15, %r15
+; AVX512-NEXT: orq %rdx, %r15
; AVX512-NEXT: shldq %cl, %rsi, %r8
-; AVX512-NEXT: orq %rax, %r14
+; AVX512-NEXT: movq -24(%rsp,%r9), %rdx
+; AVX512-NEXT: movq -16(%rsp,%r9), %rsi
+; AVX512-NEXT: movq %rsi, %rbp
+; AVX512-NEXT: shldq %cl, %rdx, %rbp
+; AVX512-NEXT: andnq 40(%rdi), %rbx, %rbx
; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: orq %rdx, %r13
-; AVX512-NEXT: movq %r11, 48(%r9)
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, 56(%r9)
-; AVX512-NEXT: movq %r10, 32(%r9)
+; AVX512-NEXT: andnq 16(%rdi), %r11, %r8
+; AVX512-NEXT: orq %rbp, %r8
+; AVX512-NEXT: shlxq %rcx, %r14, %r11
+; AVX512-NEXT: movq -32(%rsp,%r9), %r9
+; AVX512-NEXT: shldq %cl, %rsi, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT: andnq 24(%rdi), %rsi, %rsi
+; AVX512-NEXT: orq %rax, %rsi
+; AVX512-NEXT: shlxq %rcx, %r9, %rax
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %r9, %rdx
+; AVX512-NEXT: andnq (%rdi), %r11, %rcx
+; AVX512-NEXT: orq %rax, %rcx
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, 40(%r9)
-; AVX512-NEXT: movq %rdi, 16(%r9)
-; AVX512-NEXT: movq %r15, 24(%r9)
-; AVX512-NEXT: movq %r14, (%r9)
-; AVX512-NEXT: movq %rbx, 8(%r9)
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: addq $184, %rsp
+; AVX512-NEXT: andnq 8(%rdi), %rax, %rax
+; AVX512-NEXT: orq %rdx, %rax
+; AVX512-NEXT: andl $60, %r10d
+; AVX512-NEXT: movl (%rdi,%r10), %edx
+; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; AVX512-NEXT: btl %r9d, %edx
+; AVX512-NEXT: movq %r13, 48(%rdi)
+; AVX512-NEXT: movq %r12, 56(%rdi)
+; AVX512-NEXT: movq %r15, 32(%rdi)
+; AVX512-NEXT: movq %rbx, 40(%rdi)
+; AVX512-NEXT: movq %r8, 16(%rdi)
+; AVX512-NEXT: movq %rsi, 24(%rdi)
+; AVX512-NEXT: movq %rcx, (%rdi)
+; AVX512-NEXT: movq %rax, 8(%rdi)
+; AVX512-NEXT: setae %al
+; AVX512-NEXT: addq $168, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
@@ -4274,2749 +1664,25 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i4096:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $1792, %esp # imm = 0x700
-; X86-NEXT: movl 12(%ebp), %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: andl $508, %ecx # imm = 0x1FC
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 248(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 252(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 504(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 508(%esi), %edx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 120(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 124(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 376(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 380(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 184(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 188(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 440(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 444(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 312(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 316(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 216(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 220(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 472(%esi), %edi
-; X86-NEXT: movl 476(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 88(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 92(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 344(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 348(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 152(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 156(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 408(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 412(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 280(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 284(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 232(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 236(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 488(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 492(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 104(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 108(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 360(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 364(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 168(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 172(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 424(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 428(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 296(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 300(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 200(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 204(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 456(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 460(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 76(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 328(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 332(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 136(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 140(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 392(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 396(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 264(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 268(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 240(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 244(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 496(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 500(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 112(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 116(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 368(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 372(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 176(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 180(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 432(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 436(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 304(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 308(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 208(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 212(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 464(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 468(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 80(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 84(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 336(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 340(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 144(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 148(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 400(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 404(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 272(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 276(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 224(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 228(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 480(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 484(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 100(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 352(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 356(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 160(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 164(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 416(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 420(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 288(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 292(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 192(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 196(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 448(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 452(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 64(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 68(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 320(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 324(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 128(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 132(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: movl 256(%esi), %edi
-; X86-NEXT: movl 260(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 388(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 4(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shrdl $1, %eax, %edi
-; X86-NEXT: shrl %eax
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: notb %cl
-; X86-NEXT: shrdl %cl, %eax, %edi
-; X86-NEXT: shrl %cl, %ebx
-; X86-NEXT: movb $32, %cl
-; X86-NEXT: testb %cl, %cl
-; X86-NEXT: movl (%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: jne .LBB20_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: .LBB20_2:
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 320(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 64(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 448(%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 192(%eax), %ecx
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 288(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 32(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 416(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 160(%eax), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 352(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 96(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 480(%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 224(%eax), %ecx
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 272(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 16(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 400(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 144(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 336(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 80(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 464(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 208(%eax), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 304(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 48(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 432(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 176(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 368(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 112(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 496(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: andl 240(%eax), %ebx
-; X86-NEXT: orl %ecx, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 264(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 8(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 392(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 136(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 328(%ebx), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 72(%ebx), %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 456(%ebx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 200(%ebx), %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 296(%ebx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 40(%ebx), %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 424(%ebx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 168(%ebx), %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 360(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 104(%ebx), %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 488(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 232(%ebx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 280(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 24(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 408(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 152(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 344(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 88(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 472(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 216(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 312(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 56(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 440(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 184(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 376(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 120(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 504(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 248(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 324(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 68(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 452(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 196(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 292(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 36(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 420(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 164(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 356(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 100(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 484(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 228(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 276(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 20(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 404(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 148(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 340(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 84(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 468(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 212(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 308(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 52(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 436(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 180(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 372(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 116(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 500(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 244(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 268(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 12(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 396(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 140(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 332(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 76(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 460(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 204(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 300(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 44(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 428(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 172(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 364(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 108(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 492(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 236(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 284(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 28(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 412(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 156(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 348(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 92(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 476(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 220(%ebx), %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 316(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 60(%ebx), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 444(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 188(%ebx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 380(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 124(%ebx), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 508(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: andl 252(%esi), %ebx
-; X86-NEXT: orl %ecx, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: negl %ecx
-; X86-NEXT: movl 1648(%esp,%ecx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: andl 128(%edx), %ecx
-; X86-NEXT: andl 384(%edx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: andl (%edx), %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 256(%edx), %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 260(%edx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 4(%edx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 132(%edx), %eax
-; X86-NEXT: andl 388(%edx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $4064, %edx # imm = 0xFE0
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
-; SSE-LABEL: test_ne_i4096:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $1576, %rsp # imm = 0x628
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl %esi, %eax
-; SSE-NEXT: andl $4032, %eax # imm = 0xFC0
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: negl %eax
-; SSE-NEXT: movslq %eax, %rsi
-; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1304(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1560(%rsp,%rsi), %rax
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1176(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1432(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1240(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1496(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1112(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; SSE-NEXT: movq 1368(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1272(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1528(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1144(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1400(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1208(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1464(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1080(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1336(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1288(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1544(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1160(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1416(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1216(%rsp,%rsi), %r11
-; SSE-NEXT: movq 1224(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %r11, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1480(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1088(%rsp,%rsi), %r9
-; SSE-NEXT: movq 1096(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %r9, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1352(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1248(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1512(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1120(%rsp,%rsi), %rax
-; SSE-NEXT: movq 1128(%rsp,%rsi), %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rax, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1376(%rsp,%rsi), %r13
-; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx
-; SSE-NEXT: movq %rbx, %r8
-; SSE-NEXT: shldq %cl, %r13, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1192(%rsp,%rsi), %r15
-; SSE-NEXT: movq %r15, %r14
-; SSE-NEXT: shldq %cl, %rdx, %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1440(%rsp,%rsi), %r10
-; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, %r14
-; SSE-NEXT: shldq %cl, %r10, %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1312(%rsp,%rsi), %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp
-; SSE-NEXT: movq %rbp, %r12
-; SSE-NEXT: shldq %cl, %r14, %r12
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx
-; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rbp, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r9
-; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r13
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r12, %r15
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r12, %r10
-; SSE-NEXT: andq 384(%rdi), %r10
-; SSE-NEXT: andq 128(%rdi), %r15
-; SSE-NEXT: andq 320(%rdi), %r13
-; SSE-NEXT: andq 64(%rdi), %rax
-; SSE-NEXT: orq %r10, %r15
-; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: andq 448(%rdi), %r9
-; SSE-NEXT: andq 192(%rdi), %rbp
-; SSE-NEXT: orq %r9, %rbp
-; SSE-NEXT: orq %rax, %rbp
-; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq 288(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 32(%rdi), %r9
-; SSE-NEXT: andq 416(%rdi), %rdx
-; SSE-NEXT: andq 160(%rdi), %r11
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 352(%rdi), %rdx
-; SSE-NEXT: orq %r9, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 96(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 480(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 224(%rdi), %r8
-; SSE-NEXT: orq %rax, %r8
-; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq 272(%rdi), %r14
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 16(%rdi), %rax
-; SSE-NEXT: orq %r14, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 400(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 144(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 336(%rdi), %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 80(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 464(%rdi), %rdx
-; SSE-NEXT: orq %r9, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 208(%rdi), %r11
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: orq %rax, %r11
-; SSE-NEXT: orq %r8, %r11
-; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload
-; SSE-NEXT: andq 304(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 48(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 432(%rdi), %r9
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 176(%rdi), %r8
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 368(%rdi), %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 112(%rdi), %rax
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: movq %r8, %r10
-; SSE-NEXT: orq %r9, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 496(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE-NEXT: andq 240(%rdi), %rbp
-; SSE-NEXT: orq %r8, %rbp
-; SSE-NEXT: orq %rax, %rbp
-; SSE-NEXT: orq %r10, %rbp
-; SSE-NEXT: orq %r11, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 392(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: andq 136(%rdi), %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 328(%rdi), %rdx
-; SSE-NEXT: orq %rax, %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 72(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 456(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE-NEXT: andq 200(%rdi), %r13
-; SSE-NEXT: orq %rax, %r13
-; SSE-NEXT: orq %rdx, %r13
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 296(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 40(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 424(%rdi), %r8
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 168(%rdi), %rdx
-; SSE-NEXT: orq %r8, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 360(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 104(%rdi), %rax
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 488(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: andq 232(%rdi), %r15
-; SSE-NEXT: orq %rax, %r15
-; SSE-NEXT: orq %r8, %r15
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 280(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 24(%rdi), %rax
-; SSE-NEXT: orq %rdx, %r15
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 408(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 152(%rdi), %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 344(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 88(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 472(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE-NEXT: andq 216(%rdi), %r14
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: orq %rax, %r14
-; SSE-NEXT: orq %r8, %r14
-; SSE-NEXT: orq %r10, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 312(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 56(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 440(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 184(%rdi), %r9
-; SSE-NEXT: orq %r11, %r10
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: orq %r10, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 376(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 120(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 504(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 248(%rdi), %r8
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: movq 1056(%rsp,%rsi), %rax
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: andq 256(%rdi), %rdx
-; SSE-NEXT: orq %r14, %r8
-; SSE-NEXT: andq (%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq %rbp, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: andq 264(%rdi), %rcx
-; SSE-NEXT: andq 8(%rdi), %rbx
-; SSE-NEXT: orq %rcx, %rbx
-; SSE-NEXT: orq %r12, %rbx
-; SSE-NEXT: orq %r13, %rbx
-; SSE-NEXT: orq %r15, %rbx
-; SSE-NEXT: orq %r8, %rbx
-; SSE-NEXT: orq %rax, %rbx
-; SSE-NEXT: setne %al
-; SSE-NEXT: addq $1576, %rsp # imm = 0x628
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: test_ne_i4096:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $1560, %rsp # imm = 0x618
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: movslq %eax, %rsi
-; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11
-; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %r11, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12
-; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %r12, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp
-; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rbp, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax
-; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10
-; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8
-; AVX2-NEXT: movq %r8, %rdx
-; AVX2-NEXT: shldq %cl, %r10, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx
-; AVX2-NEXT: movq %rbx, %rdx
-; AVX2-NEXT: shldq %cl, %r9, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9
-; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, %r14
-; AVX2-NEXT: shldq %cl, %r9, %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14
-; AVX2-NEXT: movq %r14, %r13
-; AVX2-NEXT: shldq %cl, %r15, %r13
-; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx
-; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, %r9
-; AVX2-NEXT: andq 384(%rdi), %r9
-; AVX2-NEXT: andq 128(%rdi), %r14
-; AVX2-NEXT: andq 320(%rdi), %r10
-; AVX2-NEXT: orq %r9, %r14
-; AVX2-NEXT: movq %r14, %r15
-; AVX2-NEXT: andq 64(%rdi), %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: andq 448(%rdi), %rbp
-; AVX2-NEXT: andq 192(%rdi), %r13
-; AVX2-NEXT: orq %rbp, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq 288(%rdi), %r8
-; AVX2-NEXT: andq 32(%rdi), %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 416(%rdi), %rax
-; AVX2-NEXT: orq %r8, %r12
-; AVX2-NEXT: andq 160(%rdi), %r11
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: andq 352(%rdi), %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 96(%rdi), %rax
-; AVX2-NEXT: orq %r12, %r11
-; AVX2-NEXT: orq %rbx, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 480(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: andq 224(%rdi), %r13
-; AVX2-NEXT: orq %r10, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 272(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 16(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r13
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 400(%rdi), %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 144(%rdi), %rax
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 336(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 80(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 464(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 208(%rdi), %r11
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: orq %r8, %r11
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: orq %r9, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 304(%rdi), %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 48(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 432(%rdi), %r10
-; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload
-; AVX2-NEXT: andq 176(%rdi), %rax
-; AVX2-NEXT: orq %r9, %r8
-; AVX2-NEXT: movq %r8, %r9
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 368(%rdi), %r8
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 112(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 496(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 240(%rdi), %r9
-; AVX2-NEXT: orq %r8, %r9
-; AVX2-NEXT: orq %rax, %r9
-; AVX2-NEXT: orq %r10, %r9
-; AVX2-NEXT: orq %r11, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 392(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX2-NEXT: andq 136(%rdi), %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 328(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 72(%rdi), %rax
-; AVX2-NEXT: orq %r10, %rbp
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 456(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX2-NEXT: andq 200(%rdi), %r12
-; AVX2-NEXT: orq %rax, %r12
-; AVX2-NEXT: orq %r8, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 296(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 40(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 424(%rdi), %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 168(%rdi), %rax
-; AVX2-NEXT: orq %r10, %r8
-; AVX2-NEXT: movq %r8, %r10
-; AVX2-NEXT: orq %r11, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 360(%rdi), %r8
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 104(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 488(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: andq 232(%rdi), %r14
-; AVX2-NEXT: orq %rax, %r14
-; AVX2-NEXT: orq %r8, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 280(%rdi), %r8
-; AVX2-NEXT: orq %r10, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 24(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 408(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 152(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 344(%rdi), %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 88(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 472(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: andq 216(%rdi), %rbx
-; AVX2-NEXT: orq %rax, %rbx
-; AVX2-NEXT: orq %r8, %rbx
-; AVX2-NEXT: orq %r10, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 312(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 56(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 440(%rdi), %r10
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 184(%rdi), %r8
-; AVX2-NEXT: orq %r10, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 376(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 120(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: movq %r8, %r11
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 504(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 248(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r8, %r10
-; AVX2-NEXT: orq %r11, %rax
-; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi
-; AVX2-NEXT: orq %rbx, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: shlxq %rcx, %rsi, %rax
-; AVX2-NEXT: andq 256(%rdi), %r10
-; AVX2-NEXT: andq (%rdi), %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: orq %r15, %rax
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX2-NEXT: orq %r13, %rax
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %rsi, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: andq 264(%rdi), %rcx
-; AVX2-NEXT: andq 8(%rdi), %rdx
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: orq %rbp, %rdx
-; AVX2-NEXT: orq %r12, %rdx
-; AVX2-NEXT: orq %r14, %rdx
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: orq %rax, %rdx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: addq $1560, %rsp # imm = 0x618
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_ne_i4096:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $1560, %rsp # imm = 0x618
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl %esi, %eax
-; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %eax
-; AVX512-NEXT: negl %eax
-; AVX512-NEXT: movslq %eax, %rsi
-; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10
-; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14
-; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r14, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12
-; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r12, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax
-; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11
-; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx
-; AVX512-NEXT: movq %rbx, %rdx
-; AVX512-NEXT: shldq %cl, %r11, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9
-; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8
-; AVX512-NEXT: movq %r8, %rdx
-; AVX512-NEXT: shldq %cl, %r9, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9
-; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, %r15
-; AVX512-NEXT: shldq %cl, %r9, %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp
-; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15
-; AVX512-NEXT: movq %r15, %r13
-; AVX512-NEXT: shldq %cl, %rbp, %r13
-; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx
-; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r13
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbp, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbp, %r9
-; AVX512-NEXT: andq 384(%rdi), %r9
-; AVX512-NEXT: andq 128(%rdi), %r15
-; AVX512-NEXT: orq %r9, %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq 320(%rdi), %r11
-; AVX512-NEXT: andq 64(%rdi), %rax
-; AVX512-NEXT: orq %r11, %rax
-; AVX512-NEXT: andq 448(%rdi), %r12
-; AVX512-NEXT: andq 192(%rdi), %r13
-; AVX512-NEXT: orq %r12, %r13
-; AVX512-NEXT: orq %rax, %r13
-; AVX512-NEXT: andq 288(%rdi), %r8
-; AVX512-NEXT: andq 32(%rdi), %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 416(%rdi), %rax
-; AVX512-NEXT: orq %r8, %r14
-; AVX512-NEXT: andq 160(%rdi), %r10
-; AVX512-NEXT: orq %rax, %r10
-; AVX512-NEXT: andq 352(%rdi), %rbx
-; AVX512-NEXT: orq %r14, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 96(%rdi), %rax
-; AVX512-NEXT: orq %rbx, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 480(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: andq 224(%rdi), %r15
-; AVX512-NEXT: orq %rax, %r15
-; AVX512-NEXT: orq %r8, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 272(%rdi), %r8
-; AVX512-NEXT: orq %r10, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 16(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 400(%rdi), %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 144(%rdi), %rax
-; AVX512-NEXT: orq %r9, %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 336(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 80(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 464(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: andq 208(%rdi), %r11
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: orq %r8, %r11
-; AVX512-NEXT: orq %rax, %r11
-; AVX512-NEXT: orq %r9, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 304(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 48(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 432(%rdi), %r9
-; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 176(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: orq %r9, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 368(%rdi), %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 112(%rdi), %rax
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: movq %r8, %r10
-; AVX512-NEXT: orq %r9, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 496(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 240(%rdi), %r9
-; AVX512-NEXT: orq %r8, %r9
-; AVX512-NEXT: orq %rax, %r9
-; AVX512-NEXT: orq %r10, %r9
-; AVX512-NEXT: orq %r11, %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 392(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: andq 136(%rdi), %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 328(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 72(%rdi), %rax
-; AVX512-NEXT: orq %r10, %rbp
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 456(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX512-NEXT: andq 200(%rdi), %r12
-; AVX512-NEXT: orq %rax, %r12
-; AVX512-NEXT: orq %r8, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 296(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 40(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 424(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 168(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 360(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 104(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 488(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT: andq 232(%rdi), %r14
-; AVX512-NEXT: orq %rax, %r14
-; AVX512-NEXT: orq %r8, %r14
-; AVX512-NEXT: orq %r10, %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 280(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 24(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 408(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 152(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: andq 344(%rdi), %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 88(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 472(%rdi), %rax
-; AVX512-NEXT: orq %r11, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: andq 216(%rdi), %rbx
-; AVX512-NEXT: orq %rax, %rbx
-; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: orq %r10, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 312(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 56(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 440(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 184(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 376(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 120(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 504(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 248(%rdi), %r8
-; AVX512-NEXT: orq %rax, %r8
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: orq %r11, %r8
-; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rsi, %r10
-; AVX512-NEXT: orq %rbx, %r8
-; AVX512-NEXT: shlxq %rcx, %rax, %rsi
-; AVX512-NEXT: andq 256(%rdi), %r10
-; AVX512-NEXT: andq (%rdi), %rsi
-; AVX512-NEXT: orq %r10, %rsi
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT: orq %r13, %rsi
-; AVX512-NEXT: orq %r15, %rsi
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: orq %r9, %rsi
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 264(%rdi), %rax
-; AVX512-NEXT: andq 8(%rdi), %rdx
-; AVX512-NEXT: orq %rax, %rdx
-; AVX512-NEXT: orq %rbp, %rdx
-; AVX512-NEXT: orq %r12, %rdx
-; AVX512-NEXT: orq %r14, %rdx
-; AVX512-NEXT: orq %r8, %rdx
-; AVX512-NEXT: orq %rsi, %rdx
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: addq $1560, %rsp # imm = 0x618
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: test_ne_i4096:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andl $4064, %eax # imm = 0xFE0
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: movl (%rdi,%rax), %eax
+; X64-NEXT: btl %esi, %eax
+; X64-NEXT: setb %al
+; X64-NEXT: retq
%rem = and i32 %position, 4095
%ofs = zext nneg i32 %rem to i4096
%bit = shl nuw i4096 1, %ofs
@@ -7161,8 +1827,8 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: subl $64, %esp
+; X86-NEXT: movl 12(%ebp), %ecx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -7176,51 +1842,33 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 56(%esp,%eax), %esi
-; X86-NEXT: movl 60(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esp,%eax), %edi
-; X86-NEXT: movl 52(%esp,%eax), %eax
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl 8(%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %ecx
-; X86-NEXT: movl (%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl 12(%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl 4(%ebx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 40(%esp,%eax), %edx
+; X86-NEXT: movl 44(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl 32(%esp,%eax), %edi
+; X86-NEXT: movl 36(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: shldl %cl, %edi, %ebx
; X86-NEXT: notl %ebx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl 16(%ebp), %eax
; X86-NEXT: movl (%eax), %eax
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %ebx, 8(%esi)
-; X86-NEXT: movl %ecx, 12(%esi)
-; X86-NEXT: movl %edi, (%esi)
-; X86-NEXT: movl %edx, 4(%esi)
-; X86-NEXT: je .LBB22_2
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: andl %ebx, 4(%eax)
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: notl %edi
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl $96, %ebx
+; X86-NEXT: shrl $3, %ebx
+; X86-NEXT: movl (%eax,%ebx), %ebx
+; X86-NEXT: andl %edi, (%eax)
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %esi, 12(%eax)
+; X86-NEXT: notl %edx
+; X86-NEXT: andl %edx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: btl %ecx, %ebx
+; X86-NEXT: jae .LBB22_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB22_2:
@@ -7242,52 +1890,75 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; SSE-NEXT: testb $64, %cl
; SSE-NEXT: cmovneq %rsi, %r8
; SSE-NEXT: cmovneq %rax, %rsi
-; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq 8(%rdi), %r9
-; SSE-NEXT: movq %r9, %r10
-; SSE-NEXT: andq %r8, %r10
; SSE-NEXT: notq %r8
-; SSE-NEXT: movq %rcx, %r11
-; SSE-NEXT: andq %rsi, %r11
; SSE-NEXT: notq %rsi
-; SSE-NEXT: andq %r9, %r8
-; SSE-NEXT: andq %rcx, %rsi
-; SSE-NEXT: orq %r10, %r11
-; SSE-NEXT: jne .LBB22_2
+; SSE-NEXT: movl %ecx, %r9d
+; SSE-NEXT: andl $96, %r9d
+; SSE-NEXT: shrl $3, %r9d
+; SSE-NEXT: movl (%rdi,%r9), %r9d
+; SSE-NEXT: btl %ecx, %r9d
+; SSE-NEXT: jb .LBB22_2
; SSE-NEXT: # %bb.1:
; SSE-NEXT: movl (%rdx), %eax
; SSE-NEXT: .LBB22_2:
-; SSE-NEXT: movq %rsi, (%rdi)
-; SSE-NEXT: movq %r8, 8(%rdi)
+; SSE-NEXT: andq %rsi, (%rdi)
+; SSE-NEXT: andq %r8, 8(%rdi)
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: retq
;
-; AVX-LABEL: reset_multiload_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: movl $1, %esi
-; AVX-NEXT: xorl %r8d, %r8d
-; AVX-NEXT: shldq %cl, %rsi, %r8
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: shlxq %rcx, %rsi, %r9
-; AVX-NEXT: testb $64, %cl
-; AVX-NEXT: cmovneq %r9, %r8
-; AVX-NEXT: cmovneq %rax, %r9
-; AVX-NEXT: movq (%rdi), %r10
-; AVX-NEXT: movq 8(%rdi), %r11
-; AVX-NEXT: andnq %r11, %r8, %rcx
-; AVX-NEXT: andq %r8, %r11
-; AVX-NEXT: andnq %r10, %r9, %rsi
-; AVX-NEXT: andq %r9, %r10
-; AVX-NEXT: orq %r11, %r10
-; AVX-NEXT: jne .LBB22_2
-; AVX-NEXT: # %bb.1:
-; AVX-NEXT: movl (%rdx), %eax
-; AVX-NEXT: .LBB22_2:
-; AVX-NEXT: movq %rsi, (%rdi)
-; AVX-NEXT: movq %rcx, 8(%rdi)
-; AVX-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX-NEXT: retq
+; AVX2-LABEL: reset_multiload_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: movl $1, %r8d
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: shldq %cl, %r8, %rsi
+; AVX2-NEXT: shlxq %rcx, %r8, %r8
+; AVX2-NEXT: testb $64, %cl
+; AVX2-NEXT: cmovneq %r8, %rsi
+; AVX2-NEXT: cmovneq %rax, %r8
+; AVX2-NEXT: notq %rsi
+; AVX2-NEXT: notq %r8
+; AVX2-NEXT: movl %ecx, %r9d
+; AVX2-NEXT: andl $96, %r9d
+; AVX2-NEXT: shrl $3, %r9d
+; AVX2-NEXT: movl (%rdi,%r9), %r9d
+; AVX2-NEXT: btl %ecx, %r9d
+; AVX2-NEXT: jb .LBB22_2
+; AVX2-NEXT: # %bb.1:
+; AVX2-NEXT: movl (%rdx), %eax
+; AVX2-NEXT: .LBB22_2:
+; AVX2-NEXT: andq %r8, (%rdi)
+; AVX2-NEXT: andq %rsi, 8(%rdi)
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: reset_multiload_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: movl $1, %r8d
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: shldq %cl, %r8, %rsi
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: shlxq %rcx, %r8, %r8
+; AVX512-NEXT: testb $64, %cl
+; AVX512-NEXT: cmovneq %r8, %rsi
+; AVX512-NEXT: cmovneq %rax, %r8
+; AVX512-NEXT: notq %rsi
+; AVX512-NEXT: notq %r8
+; AVX512-NEXT: movl %ecx, %r9d
+; AVX512-NEXT: andl $96, %r9d
+; AVX512-NEXT: shrl $3, %r9d
+; AVX512-NEXT: movl (%rdi,%r9), %r9d
+; AVX512-NEXT: btl %ecx, %r9d
+; AVX512-NEXT: jb .LBB22_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: movl (%rdx), %eax
+; AVX512-NEXT: .LBB22_2:
+; AVX512-NEXT: andq %r8, (%rdi)
+; AVX512-NEXT: andq %rsi, 8(%rdi)
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
diff --git a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
index 065710f..8576f8f 100644
--- a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
@@ -3,6 +3,9 @@
; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-SINCOS-STRET
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-NOSINCOS-STRET
+
; TODO: The RUN line below will fail GISel selection and fall back to DAG selection due to the lack of support for loads/stores in i686 mode. Support is expected soon; for this reason, the llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir test is added for now because of the missing i686 support in GlobalISel.
; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
@@ -34,6 +37,29 @@ define { float, float } @test_sincos_f32(float %Val) nounwind {
; X64-NEXT: popq %rax
; X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f32:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %rax
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; MACOS-SINCOS-STRET-NEXT: popq %rax
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f32:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rax
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1
+; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: popq %rax
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: test_sincos_f32:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: subl $28, %esp
@@ -93,6 +119,28 @@ define { double, double } @test_sincos_f64(double %Val) nounwind {
; X64-NEXT: addq $24, %rsp
; X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f64:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %rax
+; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT: popq %rax
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f64:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: subq $24, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _sin
+; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero
+; MACOS-NOSINCOS-STRET-NEXT: callq _cos
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1
+; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero
+; MACOS-NOSINCOS-STRET-NEXT: addq $24, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: test_sincos_f64:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: subl $44, %esp
@@ -153,6 +201,40 @@ define { x86_fp80, x86_fp80 } @test_sincos_f80(x86_fp80 %Val) nounwind {
; X64-NEXT: addq $56, %rsp
; X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f80:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: subq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp)
+; MACOS-SINCOS-STRET-NEXT: fld %st(0)
+; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-SINCOS-STRET-NEXT: callq _cosl
+; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-SINCOS-STRET-NEXT: callq _sinl
+; MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: fxch %st(1)
+; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f80:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: subq $40, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp)
+; MACOS-NOSINCOS-STRET-NEXT: fld %st(0)
+; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosl
+; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinl
+; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: fxch %st(1)
+; MACOS-NOSINCOS-STRET-NEXT: addq $40, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: test_sincos_f80:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: subl $60, %esp
@@ -288,6 +370,57 @@ define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %
; SDAG-X64-NEXT: popq %r14
; SDAG-X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: can_fold_with_call_in_chain:
+; MACOS-SINCOS-STRET: ## %bb.0: ## %entry
+; MACOS-SINCOS-STRET-NEXT: pushq %r14
+; MACOS-SINCOS-STRET-NEXT: pushq %rbx
+; MACOS-SINCOS-STRET-NEXT: subq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movq %r14, %rdi
+; MACOS-SINCOS-STRET-NEXT: movq %rbx, %rsi
+; MACOS-SINCOS-STRET-NEXT: callq _foo
+; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%r14)
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%rbx)
+; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: popq %rbx
+; MACOS-SINCOS-STRET-NEXT: popq %r14
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: can_fold_with_call_in_chain:
+; MACOS-NOSINCOS-STRET: ## %bb.0: ## %entry
+; MACOS-NOSINCOS-STRET-NEXT: pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rax
+; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movq %r14, %rdi
+; MACOS-NOSINCOS-STRET-NEXT: movq %rbx, %rsi
+; MACOS-NOSINCOS-STRET-NEXT: callq _foo
+; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT: addq $8, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: popq %r14
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: can_fold_with_call_in_chain:
; GISEL-X86: # %bb.0: # %entry
; GISEL-X86-NEXT: pushl %ebx
diff --git a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
index 834dd78..9b02438 100644
--- a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
+++ b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
@@ -1,59 +1,213 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 5
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck -check-prefix=X86 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 | FileCheck --check-prefix=MACOS-SINCOS-STRET %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 | FileCheck --check-prefix=MACOS-NOSINCOS-STRET %s
define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind {
-; CHECK-LABEL: test_sincos_v4f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $52, %esp
-; CHECK-NEXT: movl 84(%esp), %esi
-; CHECK-NEXT: flds 76(%esp)
-; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: flds 64(%esp)
-; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: flds 72(%esp)
-; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: flds 68(%esp)
-; CHECK-NEXT: movl 80(%esp), %edi
-; CHECK-NEXT: leal 40(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: leal 4(%edi), %eax
-; CHECK-NEXT: movl %eax, 4(%esp)
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: leal 44(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: leal 8(%edi), %eax
-; CHECK-NEXT: movl %eax, 4(%esp)
-; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: leal 36(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: movl %edi, 4(%esp)
-; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: leal 48(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: addl $12, %edi
-; CHECK-NEXT: movl %edi, 4(%esp)
-; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: flds 36(%esp)
-; CHECK-NEXT: flds 40(%esp)
-; CHECK-NEXT: flds 44(%esp)
-; CHECK-NEXT: flds 48(%esp)
-; CHECK-NEXT: fstps 12(%esi)
-; CHECK-NEXT: fstps 8(%esi)
-; CHECK-NEXT: fstps 4(%esi)
-; CHECK-NEXT: fstps (%esi)
-; CHECK-NEXT: addl $52, %esp
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: popl %edi
-; CHECK-NEXT: retl
+; X86-LABEL: test_sincos_v4f32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $52, %esp
+; X86-NEXT: movl 84(%esp), %esi
+; X86-NEXT: flds 76(%esp)
+; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: flds 64(%esp)
+; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: flds 72(%esp)
+; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: flds 68(%esp)
+; X86-NEXT: movl 80(%esp), %edi
+; X86-NEXT: leal 40(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: leal 4(%edi), %eax
+; X86-NEXT: movl %eax, 4(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: leal 44(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: leal 8(%edi), %eax
+; X86-NEXT: movl %eax, 4(%esp)
+; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: leal 36(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: movl %edi, 4(%esp)
+; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: leal 48(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: addl $12, %edi
+; X86-NEXT: movl %edi, 4(%esp)
+; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: flds 36(%esp)
+; X86-NEXT: flds 40(%esp)
+; X86-NEXT: flds 44(%esp)
+; X86-NEXT: flds 48(%esp)
+; X86-NEXT: fstps 12(%esi)
+; X86-NEXT: fstps 8(%esi)
+; X86-NEXT: fstps 4(%esi)
+; X86-NEXT: fstps (%esi)
+; X86-NEXT: addl $52, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: test_sincos_v4f32:
+; X64: # %bb.0:
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $56, %rsp
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X64-NEXT: leaq 4(%rsp), %rdi
+; X64-NEXT: movq %rsp, %rsi
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: leaq 12(%rsp), %rdi
+; X64-NEXT: leaq 8(%rsp), %rsi
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: leaq 28(%rsp), %rdi
+; X64-NEXT: leaq 24(%rsp), %rsi
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-NEXT: leaq 20(%rsp), %rdi
+; X64-NEXT: leaq 16(%rsp), %rsi
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT: movups %xmm1, (%r14)
+; X64-NEXT: movups %xmm0, (%rbx)
+; X64-NEXT: addq $56, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r14
+; X64-NEXT: retq
+;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_v4f32:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %r14
+; MACOS-SINCOS-STRET-NEXT: pushq %rbx
+; MACOS-SINCOS-STRET-NEXT: subq $104, %rsp
+; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, %xmm1
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; MACOS-SINCOS-STRET-NEXT: unpcklpd (%rsp), %xmm2 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: ## xmm2 = xmm2[0],mem[0]
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx)
+; MACOS-SINCOS-STRET-NEXT: addq $104, %rsp
+; MACOS-SINCOS-STRET-NEXT: popq %rbx
+; MACOS-SINCOS-STRET-NEXT: popq %r14
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v4f32:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: subq $104, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0]
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT: addq $104, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: popq %r14
+; MACOS-NOSINCOS-STRET-NEXT: retq
%result = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x)
%result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0
%result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1
@@ -63,36 +217,120 @@ define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias
}
define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind {
-; CHECK-LABEL: test_sincos_v2f64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $52, %esp
-; CHECK-NEXT: movl 84(%esp), %esi
-; CHECK-NEXT: fldl 72(%esp)
-; CHECK-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; CHECK-NEXT: fldl 64(%esp)
-; CHECK-NEXT: movl 80(%esp), %edi
-; CHECK-NEXT: leal 24(%esp), %eax
-; CHECK-NEXT: movl %eax, 12(%esp)
-; CHECK-NEXT: movl %edi, 8(%esp)
-; CHECK-NEXT: fstpl (%esp)
-; CHECK-NEXT: calll sincos
-; CHECK-NEXT: leal 32(%esp), %eax
-; CHECK-NEXT: movl %eax, 12(%esp)
-; CHECK-NEXT: addl $8, %edi
-; CHECK-NEXT: movl %edi, 8(%esp)
-; CHECK-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
-; CHECK-NEXT: fstpl (%esp)
-; CHECK-NEXT: calll sincos
-; CHECK-NEXT: fldl 24(%esp)
-; CHECK-NEXT: fldl 32(%esp)
-; CHECK-NEXT: fstpl 8(%esi)
-; CHECK-NEXT: fstpl (%esi)
-; CHECK-NEXT: addl $52, %esp
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: popl %edi
-; CHECK-NEXT: retl
+; X86-LABEL: test_sincos_v2f64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $52, %esp
+; X86-NEXT: movl 84(%esp), %esi
+; X86-NEXT: fldl 72(%esp)
+; X86-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
+; X86-NEXT: fldl 64(%esp)
+; X86-NEXT: movl 80(%esp), %edi
+; X86-NEXT: leal 24(%esp), %eax
+; X86-NEXT: movl %eax, 12(%esp)
+; X86-NEXT: movl %edi, 8(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll sincos
+; X86-NEXT: leal 32(%esp), %eax
+; X86-NEXT: movl %eax, 12(%esp)
+; X86-NEXT: addl $8, %edi
+; X86-NEXT: movl %edi, 8(%esp)
+; X86-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll sincos
+; X86-NEXT: fldl 24(%esp)
+; X86-NEXT: fldl 32(%esp)
+; X86-NEXT: fstpl 8(%esi)
+; X86-NEXT: fstpl (%esi)
+; X86-NEXT: addl $52, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: test_sincos_v2f64:
+; X64: # %bb.0:
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $56, %rsp
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: leaq 24(%rsp), %rdi
+; X64-NEXT: leaq 16(%rsp), %rsi
+; X64-NEXT: callq sincos@PLT
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: leaq 8(%rsp), %rdi
+; X64-NEXT: movq %rsp, %rsi
+; X64-NEXT: callq sincos@PLT
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; X64-NEXT: movups %xmm1, (%r14)
+; X64-NEXT: movups %xmm0, (%rbx)
+; X64-NEXT: addq $56, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r14
+; X64-NEXT: retq
+;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_v2f64:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %r14
+; MACOS-SINCOS-STRET-NEXT: pushq %rbx
+; MACOS-SINCOS-STRET-NEXT: subq $56, %rsp
+; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx)
+; MACOS-SINCOS-STRET-NEXT: addq $56, %rsp
+; MACOS-SINCOS-STRET-NEXT: popq %rbx
+; MACOS-SINCOS-STRET-NEXT: popq %r14
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v2f64:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: subq $56, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cos
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cos
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sin
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sin
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT: addq $56, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: popq %r14
+; MACOS-NOSINCOS-STRET-NEXT: retq
%result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x)
%result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0
%result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1
diff --git a/llvm/test/DebugInfo/debug-bool-const-value.ll b/llvm/test/DebugInfo/debug-bool-const-value.ll
new file mode 100644
index 0000000..84cf993
--- /dev/null
+++ b/llvm/test/DebugInfo/debug-bool-const-value.ll
@@ -0,0 +1,29 @@
+; REQUIRES: object-emission
+; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s
+
+; CHECK: {{.*}}DW_TAG_variable
+; CHECK-NEXT: {{.*}} DW_AT_const_value (1)
+; CHECK-NEXT: {{.*}} DW_AT_name ("arg")
+
+define void @test() !dbg !5
+{
+entry:
+ call void @"llvm.dbg.value"(metadata i1 true, metadata !7, metadata !8), !dbg !6
+ ret void, !dbg !6
+}
+
+declare void @"llvm.dbg.value"(metadata %".1", metadata %".2", metadata %".3")
+
+!llvm.dbg.cu = !{ !2 }
+!llvm.module.flags = !{ !9, !10 }
+
+!1 = !DIFile(directory: "", filename: "test")
+!2 = distinct !DICompileUnit(emissionKind: FullDebug, file: !1, isOptimized: false, language: DW_LANG_C_plus_plus, runtimeVersion: 0)
+!3 = !DIBasicType(encoding: DW_ATE_boolean, name: "bool", size: 8)
+!4 = !DISubroutineType(types: !{null})
+!5 = distinct !DISubprogram(file: !1, isDefinition: true, isLocal: false, isOptimized: false, line: 5, linkageName: "test", name: "test", scope: !1, scopeLine: 5, type: !4, unit: !2)
+!6 = !DILocation(column: 1, line: 5, scope: !5)
+!7 = !DILocalVariable(arg: 0, file: !1, line: 5, name: "arg", scope: !5, type: !3)
+!8 = !DIExpression()
+!9 = !{ i32 2, !"Dwarf Version", i32 4 }
+!10 = !{ i32 2, !"Debug Info Version", i32 3 }
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
index f5cb4b7..2661ed5 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt
@@ -82,12 +82,18 @@
#CHECK: lxvprll 6, 2, 1
0x7c 0xc2 0x0c 0xda
+#CHECK: lxvpb32x 2, 15, 16
+0x7c,0x4f,0x86,0xda
+
#CHECK: stxvprl 0, 1, 2
0x7c 0x01 0x15 0x9a
#CHECK: stxvprll 6, 0, 1
0x7c 0xc0 0x0d 0xda
+#CHECK: stxvpb32x 2, 15, 16
+0x7c,0x4f,0x87,0xda
+
#CHECK: dmxvi8gerx4 1, 2, 4
0xec,0x82,0x20,0x58
diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
index f0df8ce..7fb8254 100644
--- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
+++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt
@@ -76,12 +76,18 @@
#CHECK: lxvprll 6, 2, 1
0xda 0x0c 0xc2 0x7c
+#CHECK: lxvpb32x 2, 15, 16
+0xda,0x86,0x4f,0x7c
+
#CHECK: stxvprl 0, 1, 2
0x9a 0x15 0x01 0x7c
#CHECK: stxvprll 6, 0, 1
0xda 0x0d 0xc0 0x7c
+#CHECK: stxvpb32x 2, 15, 16
+0xda,0x87,0x4f,0x7c
+
#CHECK: dmxvi8gerx4 1, 2, 4
0x58,0x20,0x82,0xec
diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
index bc0683e..40059c4 100644
--- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
+++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s
@@ -105,6 +105,10 @@
# CHECK-LE: lxvprll 6, 2, 1 # encoding: [0xda,0x0c,0xc2,0x7c]
lxvprll 6, 2, 1
+ lxvpb32x 2, 15, 16
+# CHECK-BE: lxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x86,0xda]
+# CHECK-LE: lxvpb32x 2, 15, 16 # encoding: [0xda,0x86,0x4f,0x7c]
+
# CHECK-BE: stxvprl 0, 1, 2 # encoding: [0x7c,0x01,0x15,0x9a]
# CHECK-LE: stxvprl 0, 1, 2 # encoding: [0x9a,0x15,0x01,0x7c]
stxvprl 0, 1, 2
@@ -113,6 +117,10 @@
# CHECK-LE: stxvprll 6, 0, 1 # encoding: [0xda,0x0d,0xc0,0x7c]
stxvprll 6, 0, 1
+ stxvpb32x 2, 15, 16
+# CHECK-BE: stxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x87,0xda]
+# CHECK-LE: stxvpb32x 2, 15, 16 # encoding: [0xda,0x87,0x4f,0x7c]
+
dmxvi8gerx4 1, 2, 4
# CHECK-BE: dmxvi8gerx4 1, 2, 4 # encoding: [0xec,0x82,0x20,0x58]
# CHECK-LE: dmxvi8gerx4 1, 2, 4 # encoding: [0x58,0x20,0x82,0xec]
diff --git a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll
index 8a6f60b..87aed77 100644
--- a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll
+++ b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll
@@ -184,6 +184,18 @@ define void @type_test(ptr %x) {
ret void
}
+define void @public_type_test(ptr %x) {
+; CHECK-LABEL: define void @public_type_test(
+; CHECK-SAME: ptr [[X:%.*]]) {
+; CHECK-NEXT: [[TEST:%.*]] = call i1 @llvm.public.type.test(ptr [[X]], metadata !"typeid")
+; CHECK-NEXT: call void @llvm.assume(i1 [[TEST]])
+; CHECK-NEXT: ret void
+;
+ %test = call i1 @llvm.public.type.test(ptr %x, metadata !"typeid")
+ call void @llvm.assume(i1 %test)
+ ret void
+}
+
define void @multiple_dead_conds(i32 %x) {
; CHECK-LABEL: define void @multiple_dead_conds(
; CHECK-SAME: i32 [[X:%.*]]) {
diff --git a/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll b/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll
index 14ee00d..2763860 100644
--- a/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll
+++ b/llvm/test/Transforms/IndVarSimplify/loop-guard-order.ll
@@ -114,7 +114,7 @@ define i32 @urem_order1(i32 %n) {
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
; CHECK-NEXT: call void @foo()
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 3
+; CHECK-NEXT: [[IV_NEXT]] = add nuw i32 [[IV]], 3
; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP]]
; CHECK: [[EXIT_LOOPEXIT]]:
@@ -205,13 +205,12 @@ define i64 @test_loop_with_div_order_1(i64 %n) {
; CHECK-NEXT: [[PARITY_CHECK:%.*]] = icmp eq i64 [[IS_ODD]], 0
; CHECK-NEXT: br i1 [[PARITY_CHECK]], label %[[LOOP_PREHEADER:.*]], label %[[EXIT]]
; CHECK: [[LOOP_PREHEADER]]:
-; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[UPPER_BOUND]], i64 1)
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[LOOP_PREHEADER]] ]
; CHECK-NEXT: [[DUMMY:%.*]] = load volatile i64, ptr null, align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[UMAX]]
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], [[UPPER_BOUND]]
; CHECK-NEXT: br i1 [[EXITCOND]], label %[[LOOP]], label %[[EXIT_LOOPEXIT:.*]]
; CHECK: [[EXIT_LOOPEXIT]]:
; CHECK-NEXT: br label %[[EXIT]]
diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll
index 6b090e9..f61a197 100644
--- a/llvm/test/Transforms/InstCombine/or.ll
+++ b/llvm/test/Transforms/InstCombine/or.ll
@@ -2113,3 +2113,98 @@ define <4 x i32> @or_zext_nneg_minus_constant_splat(<4 x i8> %a) {
%or = or <4 x i32> %zext, splat (i32 -9)
ret <4 x i32> %or
}
+
+define i8 @or_positive_minus_non_positive_to_abs(i8 %a) {
+; CHECK-LABEL: @or_positive_minus_non_positive_to_abs(
+; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.abs.i8(i8 [[A:%.*]], i1 false)
+; CHECK-NEXT: ret i8 [[TMP2]]
+;
+ %b = icmp sgt i8 %a, 0
+ %mask = sext i1 %b to i8
+ %neg = sub i8 0, %a
+ %mask_inv = xor i8 %mask, -1
+ %c = and i8 %neg, %mask_inv
+ %d = and i8 %a, %mask
+ %or = or i8 %c, %d
+ ret i8 %or
+}
+
+; TODO: Fold to smax https://alive2.llvm.org/ce/z/wDiDh2
+define i8 @or_select_smax_neg_to_abs(i8 %a) {
+; CHECK-LABEL: @or_select_smax_neg_to_abs(
+; CHECK-NEXT: [[SGT0:%.*]] = icmp sgt i8 [[A:%.*]], 0
+; CHECK-NEXT: [[NEG:%.*]] = sub nsw i8 0, [[A]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[SGT0]], i8 0, i8 [[NEG]]
+; CHECK-NEXT: ret i8 [[OR]]
+;
+ %sgt0 = icmp sgt i8 %a, 0
+ %neg = sub nsw i8 0, %a
+ %sel = select i1 %sgt0, i8 0, i8 %neg
+ ret i8 %sel
+}
+
+; TODO: Fold to abs https://alive2.llvm.org/ce/z/DybfHG
+define i8 @or_select_smax_smax_to_abs(i8 %a) {
+; CHECK-LABEL: @or_select_smax_smax_to_abs(
+; CHECK-NEXT: [[NEG:%.*]] = sub nsw i8 0, [[A:%.*]]
+; CHECK-NEXT: [[SEL:%.*]] = call i8 @llvm.smax.i8(i8 [[NEG]], i8 0)
+; CHECK-NEXT: [[MAX:%.*]] = call i8 @llvm.smax.i8(i8 [[A]], i8 0)
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[SEL]], [[MAX]]
+; CHECK-NEXT: ret i8 [[OR]]
+;
+ %neg = sub nsw i8 0, %a
+ %sel = call i8 @llvm.smax.i8(i8 %neg, i8 0)
+ %max = call i8 @llvm.smax.i8(i8 %a, i8 0)
+ %or = or i8 %sel, %max
+ ret i8 %or
+}
+
+declare i8 @llvm.abs.i8(i8, i1)
+declare <2 x i8> @llvm.abs.v2i8(<2 x i8>, i1)
+
+define <2 x i8> @or_sgt_select_smax_to_abs(<2 x i8> %a) {
+; CHECK-LABEL: @or_sgt_select_smax_to_abs(
+; CHECK-NEXT: [[OR:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[A:%.*]], i1 false)
+; CHECK-NEXT: ret <2 x i8> [[OR]]
+;
+ %sgt0 = icmp sgt <2 x i8> %a, zeroinitializer
+ %neg = sub <2 x i8> zeroinitializer, %a
+ %sel = select <2 x i1> %sgt0, <2 x i8> zeroinitializer, <2 x i8> %neg
+ %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer)
+ %or = or <2 x i8> %sel, %max
+ ret <2 x i8> %or
+}
+
+define <2 x i8> @or_slt_select_smax_to_abs(<2 x i8> %a) {
+; CHECK-LABEL: @or_slt_select_smax_to_abs(
+; CHECK-NEXT: [[OR:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[A:%.*]], i1 false)
+; CHECK-NEXT: ret <2 x i8> [[OR]]
+;
+ %slt0 = icmp slt <2 x i8> %a, zeroinitializer
+ %neg = sub <2 x i8> zeroinitializer, %a
+ %sel = select <2 x i1> %slt0, <2 x i8> %neg, <2 x i8> zeroinitializer
+ %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer)
+ %or = or <2 x i8> %sel, %max
+ ret <2 x i8> %or
+}
+
+; negative test - %max has multiple uses, so %or is not folded to abs.
+
+define <2 x i8> @or_select_smax_multi_uses(<2 x i8> %a) {
+; CHECK-LABEL: @or_select_smax_multi_uses(
+; CHECK-NEXT: [[B:%.*]] = icmp sgt <2 x i8> [[A:%.*]], zeroinitializer
+; CHECK-NEXT: [[NEG:%.*]] = sub <2 x i8> zeroinitializer, [[A]]
+; CHECK-NEXT: [[C:%.*]] = select <2 x i1> [[B]], <2 x i8> zeroinitializer, <2 x i8> [[NEG]]
+; CHECK-NEXT: [[D:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[A]], <2 x i8> zeroinitializer)
+; CHECK-NEXT: [[OR1:%.*]] = or <2 x i8> [[C]], [[D]]
+; CHECK-NEXT: [[OR:%.*]] = add <2 x i8> [[OR1]], [[D]]
+; CHECK-NEXT: ret <2 x i8> [[OR]]
+;
+ %sgt0 = icmp sgt <2 x i8> %a, zeroinitializer
+ %neg = sub <2 x i8> zeroinitializer, %a
+ %sel = select <2 x i1> %sgt0, <2 x i8> zeroinitializer, <2 x i8> %neg
+ %max = call <2 x i8> @llvm.smax.v2i8(<2 x i8> %a, <2 x i8> zeroinitializer)
+ %or = or <2 x i8> %sel, %max
+ %add = add <2 x i8> %or, %max
+ ret <2 x i8> %add
+}
diff --git a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll
index 3d97048..8b3c050 100644
--- a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll
+++ b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll
@@ -256,27 +256,27 @@ define <2 x i1> @not_logical_or2(i1 %b, <2 x i32> %a) {
ret <2 x i1> %and
}
-define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) {
+define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 {
; CHECK-LABEL: @bools_logical_commute0(
-; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF2]]
; CHECK-NEXT: ret i1 [[OR]]
;
%not = xor i1 %c, -1
- %and1 = select i1 %not, i1 %a, i1 false
- %and2 = select i1 %c, i1 %b, i1 false
- %or = select i1 %and1, i1 true, i1 %and2
+ %and1 = select i1 %not, i1 %a, i1 false, !prof !1
+ %and2 = select i1 %c, i1 %b, i1 false, !prof !2
+ %or = select i1 %and1, i1 true, i1 %and2, !prof !3
ret i1 %or
}
-define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) {
+define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 {
; CHECK-LABEL: @bools_logical_commute0_and1(
-; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF1]]
; CHECK-NEXT: ret i1 [[OR]]
;
%not = xor i1 %c, -1
%and1 = and i1 %not, %a
- %and2 = select i1 %c, i1 %b, i1 false
- %or = select i1 %and1, i1 true, i1 %and2
+ %and2 = select i1 %c, i1 %b, i1 false, !prof !1
+ %or = select i1 %and1, i1 true, i1 %and2, !prof !2
ret i1 %or
}
@@ -292,15 +292,15 @@ define i1 @bools_logical_commute0_and2(i1 %a, i1 %b, i1 %c) {
ret i1 %or
}
-define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) {
+define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 {
; CHECK-LABEL: @bools_logical_commute0_and1_and2(
-; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF3:![0-9]+]]
; CHECK-NEXT: ret i1 [[OR]]
;
%not = xor i1 %c, -1
%and1 = and i1 %not, %a
%and2 = and i1 %c, %b
- %or = select i1 %and1, i1 true, i1 %and2
+ %or = select i1 %and1, i1 true, i1 %and2, !prof !1
ret i1 %or
}
@@ -457,27 +457,27 @@ define i1 @bools_logical_commute3_and1_and2(i1 %b, i1 %c) {
ret i1 %or
}
-define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) {
+define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 {
; CHECK-LABEL: @bools2_logical_commute0(
-; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF1]]
; CHECK-NEXT: ret i1 [[OR]]
;
%not = xor i1 %c, -1
- %and1 = select i1 %c, i1 %a, i1 false
- %and2 = select i1 %not, i1 %b, i1 false
- %or = select i1 %and1, i1 true, i1 %and2
+ %and1 = select i1 %c, i1 %a, i1 false, !prof !1
+ %and2 = select i1 %not, i1 %b, i1 false, !prof !2
+ %or = select i1 %and1, i1 true, i1 %and2, !prof !3
ret i1 %or
}
-define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) {
+define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 {
; CHECK-LABEL: @bools2_logical_commute0_and1(
-; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF2]]
; CHECK-NEXT: ret i1 [[OR]]
;
%not = xor i1 %c, -1
%and1 = and i1 %c, %a
- %and2 = select i1 %not, i1 %b, i1 false
- %or = select i1 %and1, i1 true, i1 %and2
+ %and2 = select i1 %not, i1 %b, i1 false, !prof !1
+ %or = select i1 %and1, i1 true, i1 %and2, !prof !2
ret i1 %or
}
@@ -493,15 +493,15 @@ define i1 @bools2_logical_commute0_and2(i1 %a, i1 %b, i1 %c) {
ret i1 %or
}
-define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) {
+define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 {
; CHECK-LABEL: @bools2_logical_commute0_and1_and2(
-; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]]
+; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF3]]
; CHECK-NEXT: ret i1 [[OR]]
;
%not = xor i1 %c, -1
%and1 = and i1 %c, %a
%and2 = and i1 %not, %b
- %or = select i1 %and1, i1 true, i1 %and2
+ %or = select i1 %and1, i1 true, i1 %and2, !prof !1
ret i1 %or
}
@@ -799,8 +799,11 @@ define <2 x i1> @not_logical_and2(i1 %b, <2 x i32> %a) {
!0 = !{!"function_entry_count", i64 1000}
!1 = !{!"branch_weights", i32 2, i32 3}
+!2 = !{!"branch_weights", i32 5, i32 7}
+!3 = !{!"branch_weights", i32 11, i32 13}
;.
; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000}
; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3}
; CHECK: [[PROF2]] = !{!"branch_weights", i32 3, i32 2}
+; CHECK: [[PROF3]] = !{!"unknown", !"instcombine"}
;.
diff --git a/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll
new file mode 100644
index 0000000..4d378b0
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll
@@ -0,0 +1,30 @@
+; Check that zeroed branch weights do not cause a crash or otherwise break
+; basic LoopUnroll behavior when the pass tries to compute a probability from them.
+
+; RUN: opt < %s -S -unroll-count=2 -passes='loop-unroll' 2>&1 | FileCheck %s
+
+define void @test() {
+entry:
+ br label %loop
+
+loop:
+ br i1 false, label %end, label %loop, !prof !0
+
+end:
+ ret void
+}
+
+!0 = !{!"branch_weights", i32 0, i32 0}
+
+; CHECK: define void @test() {
+; CHECK: entry:
+; CHECK: br label %loop
+; CHECK: loop:
+; CHECK: br i1 false, label %end, label %loop.1, !prof !0
+; CHECK: loop.1:
+; CHECK: br i1 false, label %end, label %loop, !prof !0, !llvm.loop !1
+; CHECK-NOT: loop.2
+; CHECK: end:
+; CHECK: ret void
+; CHECK: }
+; CHECK: !0 = !{!"branch_weights", i32 0, i32 0}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
index 49f663f..62e248b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll
@@ -1,12 +1,12 @@
; REQUIRES: asserts
-; RUN: opt -mattr=+neon,+dotprod -passes=loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -disable-output %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
; Tests for printing VPlans that are enabled under AArch64
-define i32 @print_partial_reduction(ptr %a, ptr %b) {
+define i32 @print_partial_reduction(ptr %a, ptr %b) "target-features"="+neon,+dotprod" {
; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF
; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF
@@ -69,60 +69,37 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) {
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK: VPlan 'Final VPlan for VF={8,16},UF={1}' {
+; CHECK-NEXT: Live-in ir<1024> = vector-trip-count
; CHECK-NEXT: Live-in ir<1024> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<entry>:
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.main.loop.iter.check>
+; CHECK-NEXT: Successor(s): vector.ph
; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<vector.main.loop.iter.check>:
-; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<vector.ph>:
-; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<4>
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: EMIT vp<%1> = reduction-start-vector ir<0>, ir<0>, ir<4>
; CHECK-NEXT: Successor(s): vector.body
; CHECK-EMPTY:
; CHECK-NEXT: vector.body:
-; CHECK-NEXT: EMIT-SCALAR vp<[[EP_IV:%.+]]> = phi [ ir<0>, ir-bb<vector.ph> ], [ vp<%index.next>, vector.body ]
-; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<[[RDX_START]]>, ir<%add> (VF scaled by 1/4)
-; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[EP_IV]]>
+; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ]
+; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%accum> = phi vp<%1>, ir<%add> (VF scaled by 1/4)
+; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<%index>
; CHECK-NEXT: WIDEN ir<%load.a> = load ir<%gep.a>
-; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[EP_IV]]>
+; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<%index>
; CHECK-NEXT: WIDEN ir<%load.b> = load ir<%gep.b>
; CHECK-NEXT: WIDEN-CAST ir<%ext.b> = zext ir<%load.b> to i32
; CHECK-NEXT: WIDEN-CAST ir<%ext.a> = zext ir<%load.a> to i32
; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a>
; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul>
-; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16>
-; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<16>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, ir<1024>
; CHECK-NEXT: Successor(s): middle.block, vector.body
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
-; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add>
-; CHECK-NEXT: EMIT branch-on-cond ir<true>
-; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph>
+; CHECK-NEXT: EMIT vp<%3> = compute-reduction-result ir<%accum>, ir<%add>
+; CHECK-NEXT: Successor(s): ir-bb<exit>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<exit>:
-; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RED_RESULT]]> from middle.block)
-; CHECK-NEXT: No successors
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<scalar.ph>:
-; CHECK-NEXT: EMIT-SCALAR vp<[[EP_RESUME:%.+]]> = phi [ ir<1024>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<[[EP_MERGE:%.+]]> = phi [ vp<[[RED_RESULT]]>, middle.block ], [ ir<0>, ir-bb<entry> ]
-; CHECK-NEXT: EMIT-SCALAR vp<%6> = resume-for-epilogue vp<%vec.epilog.resume.val>
-; CHECK-NEXT: Successor(s): ir-bb<for.body>
-; CHECK-EMPTY:
-; CHECK-NEXT: ir-bb<for.body>:
-; CHECK-NEXT: IR %accum = phi i32 [ 0, %scalar.ph ], [ %add, %for.body ] (extra operand: vp<[[EP_MERGE]]> from ir-bb<scalar.ph>)
-; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv
-; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1
-; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i32
-; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv
-; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1
-; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i32
-; CHECK-NEXT: IR %mul = mul i32 %ext.b, %ext.a
-; CHECK-NEXT: IR %add = add i32 %mul, %accum
-; CHECK-NEXT: IR %iv.next = add i64 %iv, 1
-; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024
+; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<%3> from middle.block)
; CHECK-NEXT: No successors
; CHECK-NEXT: }
entry:
@@ -141,8 +118,12 @@ for.body: ; preds = %for.body, %entry
%add = add i32 %mul, %accum
%iv.next = add i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
- br i1 %exitcond.not, label %exit, label %for.body
+ br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
exit:
ret i32 %add
}
+
+!0 = distinct !{!0, !2, !3}
+!2 = !{!"llvm.loop.interleave.count", i32 1}
+!3 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
index 9deab90..fe230fa 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@@ -102,7 +102,7 @@ exit:
ret void
}
-define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
+define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr, i32 %z) optsize {
; CHECK-LABEL: sink_replicate_region_2
; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
@@ -125,16 +125,18 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]>
; CHECK-NEXT: EMIT vp<[[SPLICE:%.+]]> = first-order splice ir<%recur>, ir<%recur.next>
+; CHECK-NEXT: WIDEN ir<%cond> = icmp eq ir<%iv>, ir<%z>
+; CHECK-NEXT: EMIT vp<[[AND:%.+]]> = logical-and vp<[[MASK]]>, ir<%cond>
; CHECK-NEXT: Successor(s): pred.store
; CHECK-EMPTY:
; CHECK-NEXT: <xVFxUF> pred.store: {
; CHECK-NEXT: pred.store.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]>
+; CHECK-NEXT: BRANCH-ON-MASK vp<[[AND]]>
; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
; CHECK-EMPTY:
; CHECK-NEXT: pred.store.if:
-; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: REPLICATE ir<%rem> = srem vp<[[SPLICE]]>, ir<%x>
+; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
; CHECK-NEXT: REPLICATE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEPS]]>
; CHECK-NEXT: REPLICATE ir<%add> = add ir<%rem>, ir<%recur.next>
; CHECK-NEXT: REPLICATE store ir<%add>, ir<%gep>
@@ -143,9 +145,9 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize {
; CHECK-NEXT: pred.store.continue:
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK-NEXT: Successor(s): loop.0
+; CHECK-NEXT: Successor(s): if.1
; CHECK-EMPTY:
-; CHECK-NEXT: loop.0:
+; CHECK-NEXT: if.1:
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
@@ -162,13 +164,20 @@ entry:
br label %loop
loop:
- %recur = phi i32 [ 0, %entry ], [ %recur.next, %loop ]
- %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
- %rem = srem i32 %recur, %x
+ %recur = phi i32 [ 0, %entry ], [ %recur.next, %latch ]
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ]
%recur.next = sext i8 %y to i32
+ %cond = icmp eq i32 %iv, %z
+ br i1 %cond, label %if, label %latch
+
+if:
+ %rem = srem i32 %recur, %x
%add = add i32 %rem, %recur.next
%gep = getelementptr i32, ptr %ptr, i32 %iv
store i32 %add, ptr %gep
+ br label %latch
+
+latch:
%iv.next = add nsw i32 %iv, 1
%ec = icmp eq i32 %iv.next, 20001
br i1 %ec, label %exit, label %loop
diff --git a/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll
new file mode 100644
index 0000000..8615401
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/hoist-and-sink-mem-ops-with-invariant-pointers.ll
@@ -0,0 +1,247 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
+
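+; The load of %invariant_ptr is loop-invariant and the runtime memchecks prove
+; it does not alias %dst, making it a candidate for hoisting out of the loop.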
+define void @hoist_invariant_load_noalias_due_to_memchecks(ptr %dst, ptr %invariant_ptr, i32 %n) {
+; CHECK-LABEL: define void @hoist_invariant_load_noalias_due_to_memchecks(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[INVARIANT_PTR:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 4
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[INVARIANT_PTR]], i64 4
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[INVARIANT_PTR]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META0:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]]
+; CHECK-NEXT: store <4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[INV_VAL:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
+; CHECK-NEXT: store i32 [[INV_VAL]], ptr [[GEP]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %inv_val = load i32, ptr %invariant_ptr, align 4
+ %gep = getelementptr inbounds i32, ptr %dst, i32 %iv
+ store i32 %inv_val, ptr %gep, align 4
+ %iv.next = add nuw nsw i32 %iv, 1
+ %ec = icmp eq i32 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Test that loads with non-invariant addresses are not hoisted.
+define void @dont_hoist_variant_address(ptr %dst, ptr %src, i32 %n) {
+; CHECK-LABEL: define void @dont_hoist_variant_address(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
+; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[A1]], [[SRC2]]
+; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[INDEX]]
+; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP2]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
+; CHECK-NEXT: store i32 [[VAL]], ptr [[GEP_DST]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.src = getelementptr inbounds i32, ptr %src, i32 %iv
+ %val = load i32, ptr %gep.src, align 4
+ %gep.dst = getelementptr inbounds i32, ptr %dst, i32 %iv
+ store i32 %val, ptr %gep.dst, align 4
+ %iv.next = add nuw nsw i32 %iv, 1
+ %ec = icmp eq i32 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Test that predicated loads are not hoisted.
+define void @dont_hoist_predicated_load(ptr %dst, ptr %invariant_ptr, ptr %cond_ptr, i32 %n) {
+; CHECK-LABEL: define void @dont_hoist_predicated_load(
+; CHECK-SAME: ptr [[DST:%.*]], ptr [[INVARIANT_PTR:%.*]], ptr [[COND_PTR:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK: [[VECTOR_MEMCHECK]]:
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP20]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP22]], 4
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[COND_PTR]], i64 [[TMP3]]
+; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[INVARIANT_PTR]], i64 4
+; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[COND_PTR]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
+; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[INVARIANT_PTR]], [[SCEVGEP]]
+; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE11:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[COND_PTR]], i32 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4, !alias.scope [[META11:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
+; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK: [[PRED_STORE_IF]]:
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP4]]
+; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP9]], align 4, !alias.scope [[META16:![0-9]+]], !noalias [[META18:![0-9]+]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]]
+; CHECK: [[PRED_STORE_CONTINUE]]:
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
+; CHECK-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]]
+; CHECK: [[PRED_STORE_IF6]]:
+; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP8]]
+; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4, !alias.scope [[META16]], !noalias [[META18]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE7]]
+; CHECK: [[PRED_STORE_CONTINUE7]]:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
+; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]]
+; CHECK: [[PRED_STORE_IF8]]:
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP12]]
+; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP17]], align 4, !alias.scope [[META16]], !noalias [[META18]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE9]]
+; CHECK: [[PRED_STORE_CONTINUE9]]:
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
+; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11]]
+; CHECK: [[PRED_STORE_IF10]]:
+; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4, !alias.scope [[META14]]
+; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[TMP16]]
+; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP21]], align 4, !alias.scope [[META16]], !noalias [[META18]]
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE11]]
+; CHECK: [[PRED_STORE_CONTINUE11]]:
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT: [[GEP_COND:%.*]] = getelementptr inbounds i32, ptr [[COND_PTR]], i32 [[IV]]
+; CHECK-NEXT: [[COND:%.*]] = load i32, ptr [[GEP_COND]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[COND]], 0
+; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[LOOP_LATCH]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[INV_VAL:%.*]] = load i32, ptr [[INVARIANT_PTR]], align 4
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]]
+; CHECK-NEXT: store i32 [[INV_VAL]], ptr [[GEP]], align 4
+; CHECK-NEXT: br label %[[LOOP_LATCH]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %gep.cond = getelementptr inbounds i32, ptr %cond_ptr, i32 %iv
+ %cond = load i32, ptr %gep.cond, align 4
+ %cmp = icmp sgt i32 %cond, 0
+ br i1 %cmp, label %if.then, label %loop.latch
+
+if.then:
+ %inv_val = load i32, ptr %invariant_ptr, align 4
+ %gep = getelementptr inbounds i32, ptr %dst, i32 %iv
+ store i32 %inv_val, ptr %gep, align 4
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add nuw nsw i32 %iv, 1
+ %ec = icmp eq i32 %iv.next, %n
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 994e9c1..2dd6a04 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -29,11 +29,13 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VF]]>
; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]>
+; CHECK-NEXT: WIDEN ir<%cond> = icmp eq ir<%iv>, ir<%x>
+; CHECK-NEXT: EMIT vp<[[AND:%.+]]> = logical-and vp<[[MASK]]>, ir<%cond>
; CHECK-NEXT: Successor(s): pred.store
; CHECK: <xVFxUF> pred.store: {
; CHECK-NEXT: pred.store.entry:
-; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]>
+; CHECK-NEXT: BRANCH-ON-MASK vp<[[AND]]>
; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
; CHECK: pred.store.if:
@@ -50,24 +52,31 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; CHECK-NEXT: No successors
; CHECK-NEXT: }
-; CHECK: loop.1:
+; CHECK: if.1:
; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
-define void @sink1(i32 %k) {
+define void @sink1(i32 %k, i32 %x) {
entry:
br label %loop
loop:
- %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+ %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ]
+ %cond = icmp eq i32 %iv, %x
+ br i1 %cond, label %if, label %latch
+
+if:
%gep.b = getelementptr inbounds [2048 x i32], ptr @b, i32 0, i32 %iv
%lv.b = load i32, ptr %gep.b, align 4
%add = add i32 %lv.b, 10
%mul = mul i32 2, %add
%gep.a = getelementptr inbounds [2048 x i32], ptr @a, i32 0, i32 %iv
store i32 %mul, ptr %gep.a, align 4
+ br label %latch
+
+latch:
%iv.next = add i32 %iv, 1
%large = icmp sge i32 %iv, 8
%exitcond = icmp eq i32 %iv, %k
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll
new file mode 100644
index 0000000..a35bcf1
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-load-from-vector-loop.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt -passes='default<O3>' -S %s | FileCheck %s
+
+target triple = "arm64-apple-macosx"
+
+%"class.dealii::VectorizedArray" = type { [4 x double] }
+
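+; The load of %invariant_ptr is loop-invariant; hoisting it depends on proving
+; it does not alias the stores through %gep.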
+define void @hoist_invariant_load(ptr %invariant_ptr, i64 %num_elements, ptr %array) {
+; CHECK-LABEL: define void @hoist_invariant_load(
+; CHECK-SAME: ptr readonly captures(none) [[INVARIANT_PTR:%.*]], i64 [[NUM_ELEMENTS:%.*]], ptr captures(none) [[ARRAY:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUM_ELEMENTS]], 0
+; CHECK-NEXT: br i1 [[CMP1_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH:.*]]
+; CHECK: [[LOOP_LATCH]]:
+; CHECK-NEXT: [[I2:%.*]] = phi i64 [ [[I_NEXT:%.*]], %[[LOOP_LATCH]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nusw %"class.dealii::VectorizedArray", ptr [[ARRAY]], i64 [[I2]]
+; CHECK-NEXT: [[INVARIANT_VAL:%.*]] = load double, ptr [[INVARIANT_PTR]], align 8
+; CHECK-NEXT: [[ARRAY_VAL:%.*]] = load double, ptr [[GEP]], align 8
+; CHECK-NEXT: [[SUM:%.*]] = fadd double [[INVARIANT_VAL]], [[ARRAY_VAL]]
+; CHECK-NEXT: store double [[SUM]], ptr [[GEP]], align 8
+; CHECK-NEXT: [[I_NEXT]] = add nuw i64 [[I2]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[I_NEXT]], [[NUM_ELEMENTS]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_LATCH]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %loop.header
+
+loop.header: ; preds = %loop.latch, %entry
+ %i = phi i64 [ 0, %entry ], [ %i.next, %loop.latch ]
+ %cmp = icmp ult i64 %i, %num_elements
+ br i1 %cmp, label %loop.latch, label %exit
+
+loop.latch: ; preds = %loop.header
+ %gep = getelementptr nusw %"class.dealii::VectorizedArray", ptr %array, i64 %i
+ %invariant_val = load double, ptr %invariant_ptr, align 8
+ %array_val = load double, ptr %gep, align 8
+ %sum = fadd double %array_val, %invariant_val
+ store double %sum, ptr %gep, align 8
+ %i.next = add i64 %i, 1
+ br label %loop.header
+
+exit: ; preds = %loop.header
+ ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll
new file mode 100644
index 0000000..590b0be
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S --mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+@a = common global [100 x i64] zeroinitializer, align 64
+
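+; The two adjacent i64 stores to @a (offsets 48 and 56) should be vectorized
+; into a single <2 x i64> store, even though the stored values flow through
+; phis and the adds feeding them have multiple uses across blocks.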
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[TMP0]], splat (i64 1)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 1)
+; CHECK-NEXT: br i1 false, label %[[LOP_RHSCNT_I_PEEL:.*]], label %[[LAND_END_I_PEEL:.*]]
+; CHECK: [[LOP_RHSCNT_I_PEEL]]:
+; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP1]], <i64 1, i64 0>
+; CHECK-NEXT: br label %[[LAND_END_I_PEEL]]
+; CHECK: [[LAND_END_I_PEEL]]:
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP4]], %[[LOP_RHSCNT_I_PEEL]] ]
+; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %.promoted104.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8
+ %.promoted103.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8
+ %0 = add i64 %.promoted104.i, 1
+ %1 = add i64 %.promoted103.i, 1
+ %2 = add i64 %0, 1
+ br i1 false, label %lop.rhscnt.i.peel, label %land.end.i.peel
+
+lop.rhscnt.i.peel:
+ %3 = or i64 %1, 1
+ br label %land.end.i.peel
+
+land.end.i.peel:
+ %4 = phi i64 [ %2, %entry ], [ %0, %lop.rhscnt.i.peel ]
+ %5 = phi i64 [ %1, %entry ], [ %3, %lop.rhscnt.i.peel ]
+ store i64 %5, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8
+ store i64 %4, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8
+ ret void
+}
diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll
new file mode 100644
index 0000000..42f9519
--- /dev/null
+++ b/llvm/test/Transforms/StructurizeCFG/callbr.ll
@@ -0,0 +1,235 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s
+
+; Structurize as usual, but don't tear callbr and its destination blocks apart.
+;
+; Note: currently, callbr blocks and their corresponding target blocks
+; themselves are not handled by the structurizer.* If the CFG turns out to be
+; unstructured at the end, the CFG lowering (si-annotate-control-flow) will
+; detect this. For the currently intended use cases of callbr in the context of
+; the AMDGPU backend, this is not a limitation (cf.
+; https://discourse.llvm.org/t/rfc-add-callbr-intrinsic-support/86087).
+;
+; Note 2: while callbr and its targets remain untouched, everything else is
+; handled as usual, even if it is nested in a callbr region.
+;
+; *FIXME: this will be fixed in the future. Callbr can be handled as follows:
+; Input IR:
+; ```
+; define void @foo_callbr() {
+; callbr void asm "", "!i"() to label %fallthrough [label %indirect, ...]
+; fallthrough:
+; br label %exit
+; indirect:
+; br label %exit
+; ...
+; exit:
+; ret void
+; }
+; ```
+;
+; Output IR:
+; ```
+; define void @foo_callbr() {
+; callbr void asm "", "!i"()
+; to label %fallthrough [label %fake.indirect, label %fake.indirect1, label %fake.indirect2, ...]
+; fake.indirect: ; preds = %0
+; br label %Flow
+; fake.indirect1: ; preds = %0
+; br label %Flow
+; fake.indirect2: ; preds = %0
+; br label %Flow
+; ...
+; Flow: ; preds = %fallthrough, %fake.indirect[0-N]
+; %1 = phi i1 [ false, %fallthrough ], [ true, %fake.indirect ], [ false, %fake.indirect[1-N] ]
+; br i1 %1, label %indirect, label %Flow1
+; Flow1: ; preds = %Flow, %indirect
+; %2 = phi i1 [ false, %Flow ], [ true, %fake.indirect1 ], [ false, %indirect ]
+; br i1 %2, label %indirect1, label %Flow2
+; Flow2: ; preds = %Flow1, %indirect1
+; %3 = phi i1 [ false, %Flow1 ], [ true, %fake.indirect2 ], [ false, %indirect1 ]
+; br i1 %3, label %indirect2, label %Flow3
+; ...
+; fallthrough: ; preds = %0
+; br label %Flow
+; indirect: ; preds = %Flow
+; br label %Flow1
+; indirect1: ; preds = %Flow1
+; br label %Flow2
+; indirect2: ; preds = %Flow2
+; br label %Flow3
+; ...
+; exit: ; preds = %indirectN, %FlowN
+; ret void
+; }
+; ```
+;
+; Output IR as ASCII-art:
+; %0
+; ---------------------
+; | | | |
+; v v v v
+; f f.i f.i1 f.i2
+; | | | |
+; v v v v
+; ---------------------
+; %Flow
+; | \
+; | %indirect
+; | /
+; %Flow1
+; | \
+; | %indirect1
+; | /
+; %Flow2
+; | \
+; | %indirect2
+; | /
+; %exit
+;
+
+; Only callbr, nothing to do.
+define void @callbr_simple() {
+; CHECK-LABEL: define void @callbr_simple() {
+; CHECK-NEXT: [[CALLBR:.*:]]
+; CHECK-NEXT: callbr void asm "", "!i"()
+; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect]
+; CHECK: [[INDIRECT]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[INDIRECT1:.*:]]
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+callbr:
+ callbr void asm "", "!i"() to label %fallthrough [label %indirect]
+fallthrough:
+ br label %exit
+indirect:
+ br label %exit
+exit:
+ ret void
+}
+
+; Callbr nested in non-callbr: non-callbr is transformed
+define void @callbr_in_non_callbr(i1 %c) {
+; CHECK-LABEL: define void @callbr_in_non_callbr(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true
+; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW:.*]]
+; CHECK: [[FLOW]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[NOCALLBR]] ], [ true, [[TMP0:%.*]] ]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[EXIT:.*]]
+; CHECK: [[CALLBR]]:
+; CHECK-NEXT: callbr void asm "", "!i"()
+; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect]
+; CHECK: [[INDIRECT]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[INDIRECT1:.*:]]
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[NOCALLBR]]:
+; CHECK-NEXT: br label %[[FLOW]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+ br i1 %c, label %callbr, label %nocallbr
+callbr:
+ callbr void asm "", "!i"() to label %fallthrough [label %indirect]
+fallthrough:
+ br label %exit
+indirect:
+ br label %exit
+nocallbr:
+ br label %exit
+exit:
+ ret void
+}
+
+; Callbr parent of non-callbr: non-callbr is transformed
+define void @non_callbr_in_callbr(i1 %c) {
+; CHECK-LABEL: define void @non_callbr_in_callbr(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true
+; CHECK-NEXT: callbr void asm "", "!i"()
+; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect]
+; CHECK: [[INDIRECT]]:
+; CHECK-NEXT: br i1 [[C_INV]], label %[[FALLTHROUGH2:.*]], label %[[FLOW:.*]]
+; CHECK: [[FLOW]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FALLTHROUGH2]] ], [ true, %[[INDIRECT]] ]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[FALLTHROUGH1:.*]], label %[[FLOW1:.*]]
+; CHECK: [[FALLTHROUGH1]]:
+; CHECK-NEXT: br label %[[FLOW1]]
+; CHECK: [[FALLTHROUGH2]]:
+; CHECK-NEXT: br label %[[FLOW]]
+; CHECK: [[INDIRECT1:.*:]]
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[FLOW1]]:
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+ callbr void asm "", "!i"() to label %fallthrough [label %indirect]
+fallthrough:
+ br i1 %c, label %fallthrough1, label %fallthrough2
+fallthrough1:
+ br label %exit
+fallthrough2:
+ br label %exit
+indirect:
+ br label %exit
+exit:
+ ret void
+}
+
+; Callbr surrounded by non-callbr: all three regular branches are handled
+; correctly
+define void @callbr_nested_in_non_callbr(i1 %c, i1 %d, i1 %e, i1 %f) {
+; CHECK-LABEL: define void @callbr_nested_in_non_callbr(
+; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]], i1 [[F:%.*]]) {
+; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true
+; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]]
+; CHECK: [[FLOW3]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]]
+; CHECK: [[CALLBR]]:
+; CHECK-NEXT: callbr void asm "", "!i"()
+; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect]
+; CHECK: [[INDIRECT]]:
+; CHECK-NEXT: br i1 [[D]], label %[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]]
+; CHECK: [[FALLTHROUGH1]]:
+; CHECK-NEXT: br label %[[FLOW2]]
+; CHECK: [[INDIRECT2:.*:]]
+; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]]
+; CHECK: [[INDIRECT1]]:
+; CHECK-NEXT: br label %[[FLOW1]]
+; CHECK: [[NOCALLBR]]:
+; CHECK-NEXT: br i1 [[F]], label %[[NOCALLBR1:.*]], label %[[FLOW]]
+; CHECK: [[NOCALLBR1]]:
+; CHECK-NEXT: br label %[[FLOW]]
+; CHECK: [[FLOW]]:
+; CHECK-NEXT: br label %[[FLOW3]]
+; CHECK: [[FLOW1]]:
+; CHECK-NEXT: br label %[[RET]]
+; CHECK: [[FLOW2]]:
+; CHECK-NEXT: br label %[[RET]]
+; CHECK: [[RET]]:
+; CHECK-NEXT: ret void
+;
+ br i1 %c, label %callbr, label %nocallbr
+callbr:
+ callbr void asm "", "!i"() to label %fallthrough [label %indirect]
+fallthrough:
+ br i1 %d, label %fallthrough1, label %ret
+fallthrough1:
+ br label %ret
+indirect:
+ br i1 %e, label %indirect1, label %ret
+indirect1:
+ br label %ret
+nocallbr:
+ br i1 %f, label %nocallbr1, label %ret
+nocallbr1:
+ br label %ret
+ret:
+ ret void
+}
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 11a5a57..cadf781 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -57,8 +57,13 @@ if config.enable_profcheck:
# so we just exclude llvm-reduce tests from this config altogether. This should
# be fine though as profcheck config tests are mostly concerned with opt.
config.excludes.append("llvm-reduce")
+ # Exclude llvm-objcopy tests - not the target of this effort, and some use
+ # cat in ways that conflict with how profcheck uses it.
+ config.excludes.append("llvm-objcopy")
# (Issue #161235) Temporarily exclude LoopVectorize.
config.excludes.append("LoopVectorize")
+ # Exclude UpdateTestChecks - they fail because of the inserted !prof annotations.
+ config.excludes.append("UpdateTestChecks")
# test_source_root: The root path where tests are located.
config.test_source_root = os.path.dirname(__file__)
@@ -474,7 +479,7 @@ if config.host_ldflags.find("-m32") < 0 and any(
config.available_features.add("host-byteorder-" + sys.byteorder + "-endian")
if config.target_triple:
if re.match(
- r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|s390x|s390|tce|thumbeb)-.*",
+ r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|sparc64|s390x|s390|tce|thumbeb)-.*",
config.target_triple,
):
config.available_features.add("target-byteorder-big-endian")
diff --git a/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test
new file mode 100644
index 0000000..00141f12
--- /dev/null
+++ b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test
@@ -0,0 +1,33 @@
+# RUN: dsymutil -include-swiftmodules-from-interface -verbose -oso-prepend-path=%p -y -o %t.dSYM %s | FileCheck %s
+#
+# RUN: dsymutil -include-swiftmodules-from-interface --linker parallel -verbose -oso-prepend-path=%p -y %s -o %t-parallel.dSYM | FileCheck %s
+#
+# To regenerate:
+# echo ''>I.swift
+# echo ''>B.swift
+# echo 'import I'>main.swift
+# xcrun swiftc -emit-module-interface-path I.swiftinterface -enable-library-evolution I.swift
+# xcrun swiftc -emit-module-path B.swiftmodule B.swift -Xfrontend -no-serialize-debugging-options
+# xcrun swiftc -explicit-module-build main.swift -I. -module-cache-path cache -g -Xfrontend -no-serialize-debugging-options
+# output is "B.swiftmodule" and "cache/I*.swiftmodule"
+#
+# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/Binary.swiftmodule
+# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/FromInterface.swiftmodule
+
+#
+---
+triple: 'arm64-apple-darwin'
+objects:
+ - filename: '../Inputs/Binary.swiftmodule'
+ timestamp: 0
+ type: 50
+ symbols: []
+ - filename: '../Inputs/FromInterface.swiftmodule'
+ timestamp: 0
+ type: 50
+ symbols: []
+ - filename: '../Inputs/FromInterface.swiftmodule'
+ timestamp: 0
+ type: 50
+ symbols: []
+...
diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test
index 1574fe3..0b0bce1 100644
--- a/llvm/test/tools/dsymutil/cmdline.test
+++ b/llvm/test/tools/dsymutil/cmdline.test
@@ -14,6 +14,7 @@ CHECK: -fat64
CHECK: -flat
CHECK: -gen-reproducer
CHECK: -help
+CHECK: -include-swiftmodules-from-interface
CHECK: -keep-function-for-static
CHECK: -no-object-timestamp
CHECK: -no-odr