Diffstat (limited to 'llvm/test/CodeGen/X86')
-rw-r--r--  llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll | 20
-rw-r--r--  llvm/test/CodeGen/X86/3addr-16bit.ll | 48
-rw-r--r--  llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll | 18
-rw-r--r--  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/fp-bitcast.ll | 47
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/select-copy.mir | 136
-rw-r--r--  llvm/test/CodeGen/X86/O0-pipeline.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir | 2
-rw-r--r--  llvm/test/CodeGen/X86/addcarry.ll | 29
-rw-r--r--  llvm/test/CodeGen/X86/amx-tf32-internal.ll | 7
-rw-r--r--  llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll | 12
-rwxr-xr-x  llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll | 122
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll | 136
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir | 165
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir | 153
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_copy.mir | 97
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll | 87
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll | 61
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir | 134
-rw-r--r--  llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir | 113
-rw-r--r--  llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll | 371
-rw-r--r--  llvm/test/CodeGen/X86/apx/compress-evex.mir | 8
-rw-r--r--  llvm/test/CodeGen/X86/apx/no-rex2-general.ll | 122
-rw-r--r--  llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll | 29
-rw-r--r--  llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll | 31
-rw-r--r--  llvm/test/CodeGen/X86/apx/no-rex2-special.ll | 113
-rw-r--r--  llvm/test/CodeGen/X86/apx/setzucc.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/atomic-rm-bit-test.ll | 22
-rw-r--r--  llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll | 96
-rw-r--r--  llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll | 18
-rw-r--r--  llvm/test/CodeGen/X86/avx10_2bf16-arith.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll | 24
-rw-r--r--  llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/avx2-arith.ll | 11
-rw-r--r--  llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll | 16
-rw-r--r--  llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir | 26
-rw-r--r--  llvm/test/CodeGen/X86/avx512-mask-set-opt.ll | 229
-rw-r--r--  llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir | 30
-rw-r--r--  llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll | 22
-rw-r--r--  llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll | 58
-rw-r--r--  llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll | 60
-rw-r--r--  llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll | 28
-rw-r--r--  llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll | 44
-rw-r--r--  llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll | 24
-rw-r--r--  llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll | 185
-rw-r--r--  llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll | 72
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll | 39
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll | 93
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-sections-list.ll | 62
-rw-r--r--  llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/bfloat.ll | 743
-rw-r--r--  llvm/test/CodeGen/X86/bitcast-vector-bool.ll | 32
-rw-r--r--  llvm/test/CodeGen/X86/bitcnt-big-integer.ll | 6272
-rw-r--r--  llvm/test/CodeGen/X86/bittest-big-integer.ll | 7582
-rw-r--r--  llvm/test/CodeGen/X86/build-vector-128.ll | 228
-rw-r--r--  llvm/test/CodeGen/X86/build-vector-256.ll | 5
-rw-r--r--  llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/call-graph-section-assembly.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-loc.mir | 4
-rw-r--r--  llvm/test/CodeGen/X86/chain_order.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/combine-fceil.ll | 193
-rw-r--r--  llvm/test/CodeGen/X86/combine-fcmp.ll | 330
-rw-r--r--  llvm/test/CodeGen/X86/combine-ffloor.ll | 193
-rw-r--r--  llvm/test/CodeGen/X86/combine-fnearbyint.ll | 193
-rw-r--r--  llvm/test/CodeGen/X86/combine-frint.ll | 193
-rw-r--r--  llvm/test/CodeGen/X86/combine-fround.ll | 419
-rw-r--r--  llvm/test/CodeGen/X86/combine-froundeven.ll | 193
-rw-r--r--  llvm/test/CodeGen/X86/combine-fsqrt.ll | 174
-rw-r--r--  llvm/test/CodeGen/X86/combine-ftrunc.ll | 193
-rw-r--r--  llvm/test/CodeGen/X86/combine-icmp.ll | 905
-rw-r--r--  llvm/test/CodeGen/X86/combine-mul.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/combine-rcp.ll | 65
-rw-r--r--  llvm/test/CodeGen/X86/combine-rndscale.ll | 162
-rw-r--r--  llvm/test/CodeGen/X86/combine-rsqrt.ll | 65
-rw-r--r--  llvm/test/CodeGen/X86/combine-sub-usat.ll | 63
-rw-r--r--  llvm/test/CodeGen/X86/compress-undef-float-passthrough.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/dag-combine-counter.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll | 55
-rw-r--r--  llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll | 68
-rw-r--r--  llvm/test/CodeGen/X86/discriminate-mem-ops.ll | 55
-rw-r--r--  llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/fmaxnum.ll | 45
-rw-r--r--  llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll | 382
-rw-r--r--  llvm/test/CodeGen/X86/fminnum.ll | 45
-rw-r--r--  llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll | 162
-rw-r--r--  llvm/test/CodeGen/X86/fold-loop-of-urem.ll | 81
-rw-r--r--  llvm/test/CodeGen/X86/freeze-binary.ll | 26
-rw-r--r--  llvm/test/CodeGen/X86/gfni-shifts.ll | 139
-rw-r--r--  llvm/test/CodeGen/X86/haddsubsat.ll | 101
-rw-r--r--  llvm/test/CodeGen/X86/icmp-abs-C.ll | 22
-rw-r--r--  llvm/test/CodeGen/X86/insert-prefetch-inline.afdo | 4
-rw-r--r--  llvm/test/CodeGen/X86/insert-prefetch-inline.ll | 76
-rw-r--r--  llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo | 2
-rw-r--r--  llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll | 41
-rw-r--r--  llvm/test/CodeGen/X86/insert-prefetch-other.afdo | 3
-rw-r--r--  llvm/test/CodeGen/X86/insert-prefetch.afdo | 3
-rw-r--r--  llvm/test/CodeGen/X86/insert-prefetch.ll | 101
-rw-r--r--  llvm/test/CodeGen/X86/ipra-reg-usage.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/isel-arg-attrs.ll | 23
-rw-r--r--  llvm/test/CodeGen/X86/isel-icmp.ll | 178
-rw-r--r--  llvm/test/CodeGen/X86/isel-llvm.sincos.ll | 133
-rw-r--r--  llvm/test/CodeGen/X86/kmov.ll | 44
-rw-r--r--  llvm/test/CodeGen/X86/ldexp-avx512.ll | 312
-rw-r--r--  llvm/test/CodeGen/X86/llc-accept-avx10-512.ll | 97
-rw-r--r--  llvm/test/CodeGen/X86/llc-fp-contract-warning.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/llvm.sincos.vec.ll | 404
-rw-r--r--  llvm/test/CodeGen/X86/llvm.sincospi.ll | 233
-rw-r--r--  llvm/test/CodeGen/X86/loop-strength-reduce5.ll | 10
-rw-r--r--  llvm/test/CodeGen/X86/madd.ll | 22
-rw-r--r--  llvm/test/CodeGen/X86/masked_gather_scatter.ll | 234
-rw-r--r--  llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll | 130
-rw-r--r--  llvm/test/CodeGen/X86/masked_store_trunc_usat.ll | 109
-rw-r--r--  llvm/test/CodeGen/X86/matrix-multiply.ll | 74
-rw-r--r--  llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll | 445
-rw-r--r--  llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll | 273
-rw-r--r--  llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll | 225
-rw-r--r--  llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 241
-rw-r--r--  llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 324
-rw-r--r--  llvm/test/CodeGen/X86/midpoint-int.ll | 28
-rw-r--r--  llvm/test/CodeGen/X86/min-legal-vector-width.ll | 106
-rw-r--r--  llvm/test/CodeGen/X86/mmx-arith.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/mul-constant-i16.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/mul-constant-i32.ll | 16
-rw-r--r--  llvm/test/CodeGen/X86/mul-constant-i8.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/narrow-add-i64.ll | 94
-rw-r--r--  llvm/test/CodeGen/X86/oddshuffles.ll | 86
-rw-r--r--  llvm/test/CodeGen/X86/opt-pipeline.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/optimize-max-0.ll | 211
-rw-r--r--  llvm/test/CodeGen/X86/parity.ll | 30
-rw-r--r--  llvm/test/CodeGen/X86/pmul.ll | 223
-rw-r--r--  llvm/test/CodeGen/X86/pr114360.ll | 1
-rw-r--r--  llvm/test/CodeGen/X86/pr165755.ll | 26
-rw-r--r--  llvm/test/CodeGen/X86/pr166058.ll | 15
-rw-r--r--  llvm/test/CodeGen/X86/pr166534.ll | 88
-rw-r--r--  llvm/test/CodeGen/X86/pr166744.ll | 66
-rw-r--r--  llvm/test/CodeGen/X86/pr167793.ll | 30
-rw-r--r--  llvm/test/CodeGen/X86/pr168594.ll | 22
-rw-r--r--  llvm/test/CodeGen/X86/pr169205.ll | 23
-rw-r--r--  llvm/test/CodeGen/X86/pr49451.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/pr63790.ll | 26
-rw-r--r--  llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll | 76
-rw-r--r--  llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll | 58
-rw-r--r--  llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll | 9
-rw-r--r--  llvm/test/CodeGen/X86/regalloc-fp.ll | 775
-rw-r--r--  llvm/test/CodeGen/X86/rotate-extract.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/rounding-ops.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/scatter-schedule.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/setcc-wide-types.ll | 432
-rw-r--r--  llvm/test/CodeGen/X86/shift-i512.ll | 2206
-rw-r--r--  llvm/test/CodeGen/X86/smul_fix.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll | 42
-rw-r--r--  llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll | 30
-rw-r--r--  llvm/test/CodeGen/X86/srem-vector-lkk.ll | 372
-rw-r--r--  llvm/test/CodeGen/X86/sshl_sat.ll | 40
-rw-r--r--  llvm/test/CodeGen/X86/sshl_sat_vec.ll | 113
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll | 40
-rw-r--r--  llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll | 24
-rw-r--r--  llvm/test/CodeGen/X86/stackmap.ll | 9
-rw-r--r--  llvm/test/CodeGen/X86/strictfp-inlineasm.ll | 27
-rw-r--r--  llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll | 210
-rw-r--r--  llvm/test/CodeGen/X86/twoaddr-lea.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll | 166
-rw-r--r--  llvm/test/CodeGen/X86/umul_fix.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll | 19
-rw-r--r--  llvm/test/CodeGen/X86/urem-vector-lkk.ll | 360
-rw-r--r--  llvm/test/CodeGen/X86/ushl_sat.ll | 28
-rw-r--r--  llvm/test/CodeGen/X86/ushl_sat_vec.ll | 111
-rw-r--r--  llvm/test/CodeGen/X86/vector-compress-freeze.ll | 36
-rw-r--r--  llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll | 434
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll | 16
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshl-sub128.ll | 25
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshr-128.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshr-256.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshr-512.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll | 16
-rw-r--r--  llvm/test/CodeGen/X86/vector-fshr-sub128.ll | 47
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll | 8
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll | 126
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll | 202
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll | 484
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll | 436
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll | 846
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll | 460
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll | 26
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll | 72
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll | 82
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll | 258
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll | 420
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll | 474
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll | 448
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll | 34
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll | 170
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll | 302
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll | 604
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll | 678
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll | 1083
-rw-r--r--  llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll | 991
-rw-r--r--  llvm/test/CodeGen/X86/vector-mul.ll | 28
-rw-r--r--  llvm/test/CodeGen/X86/vector-mulfix-legalize.ll | 34
-rw-r--r--  llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 84
-rw-r--r--  llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll | 160
-rw-r--r--  llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll | 38
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-ashr-512.ll | 40
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll | 70
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-lshr-512.ll | 22
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll | 94
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 12
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-shl-256.ll | 20
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-shl-512.ll | 27
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll | 31
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll | 24
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll | 37
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-combining.ll | 89
-rw-r--r--  llvm/test/CodeGen/X86/vpternlog.ll (renamed from llvm/test/CodeGen/X86/issue163738.ll) | 12
-rw-r--r--  llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll | 38794
-rw-r--r--  llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll | 1344
-rw-r--r--  llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll | 328
-rw-r--r--  llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll | 280
-rw-r--r--  llvm/test/CodeGen/X86/x86-shrink-wrapping.ll | 6
-rw-r--r--  llvm/test/CodeGen/X86/x87-stack-pop.mir | 3
233 files changed, 44190 insertions, 39026 deletions
diff --git a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
index 1962dde..f2b4c49 100644
--- a/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
+++ b/llvm/test/CodeGen/X86/2012-01-10-UndefExceptionEdge.ll
@@ -36,10 +36,10 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: Ltmp0:
+; CHECK-NEXT: Ltmp0: ## EH_LABEL
; CHECK-NEXT: ## implicit-def: $ebx
; CHECK-NEXT: calll __Znam
-; CHECK-NEXT: Ltmp1:
+; CHECK-NEXT: Ltmp1: ## EH_LABEL
; CHECK-NEXT: ## %bb.1: ## %bb11
; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: movb $1, %al
@@ -58,13 +58,13 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
; CHECK-NEXT: jne LBB0_9
; CHECK-NEXT: ## %bb.10: ## %bb41
; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1
-; CHECK-NEXT: Ltmp2:
+; CHECK-NEXT: Ltmp2: ## EH_LABEL
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl %esi, (%esp)
; CHECK-NEXT: calll _Pjii
-; CHECK-NEXT: Ltmp3:
+; CHECK-NEXT: Ltmp3: ## EH_LABEL
; CHECK-NEXT: ## %bb.11: ## %bb42
; CHECK-NEXT: ## in Loop: Header=BB0_8 Depth=1
; CHECK-NEXT: xorl %eax, %eax
@@ -126,20 +126,20 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
; CHECK-NEXT: decl {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
; CHECK-NEXT: jmp LBB0_8
; CHECK-NEXT: LBB0_18: ## %bb43
-; CHECK-NEXT: Ltmp5:
+; CHECK-NEXT: Ltmp5: ## EH_LABEL
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: calll _OnOverFlow
-; CHECK-NEXT: Ltmp6:
+; CHECK-NEXT: Ltmp6: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_3
; CHECK-NEXT: LBB0_2: ## %bb29
-; CHECK-NEXT: Ltmp7:
+; CHECK-NEXT: Ltmp7: ## EH_LABEL
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: calll _OnOverFlow
-; CHECK-NEXT: Ltmp8:
+; CHECK-NEXT: Ltmp8: ## EH_LABEL
; CHECK-NEXT: LBB0_3: ## %bb30
; CHECK-NEXT: ud2
; CHECK-NEXT: LBB0_4: ## %bb20.loopexit
-; CHECK-NEXT: Ltmp4:
+; CHECK-NEXT: Ltmp4: ## EH_LABEL
; CHECK-NEXT: LBB0_9:
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: LBB0_6: ## %bb23
@@ -151,7 +151,7 @@ define void @f(ptr nocapture %arg, ptr nocapture %arg1, ptr nocapture %arg2, ptr
; CHECK-NEXT: popl %ebp
; CHECK-NEXT: retl
; CHECK-NEXT: LBB0_5: ## %bb20.loopexit.split-lp
-; CHECK-NEXT: Ltmp9:
+; CHECK-NEXT: Ltmp9: ## EH_LABEL
; CHECK-NEXT: jmp LBB0_6
; CHECK-NEXT: Lfunc_end0:
bb:
diff --git a/llvm/test/CodeGen/X86/3addr-16bit.ll b/llvm/test/CodeGen/X86/3addr-16bit.ll
index c9390d9..2b692bf 100644
--- a/llvm/test/CodeGen/X86/3addr-16bit.ll
+++ b/llvm/test/CodeGen/X86/3addr-16bit.ll
@@ -10,27 +10,27 @@ define zeroext i16 @test1(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
; X64-LABEL: test1:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movl %esi, %eax
-; X64-NEXT: incl %eax
-; X64-NEXT: cmpw %di, %si
+; X64-NEXT: incl %esi
+; X64-NEXT: cmpw %di, %ax
; X64-NEXT: jne LBB0_2
; X64-NEXT: ## %bb.1: ## %bb
; X64-NEXT: pushq %rbx
-; X64-NEXT: movzwl %ax, %ebx
+; X64-NEXT: movzwl %si, %ebx
; X64-NEXT: movl %ebx, %edi
; X64-NEXT: callq _foo
; X64-NEXT: movl %ebx, %eax
; X64-NEXT: popq %rbx
; X64-NEXT: retq
; X64-NEXT: LBB0_2: ## %bb1
-; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %si, %eax
; X64-NEXT: retq
;
; X86-LABEL: test1:
; X86: ## %bb.0: ## %entry
; X86-NEXT: pushl %esi
; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: incl %eax
; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx
; X86-NEXT: jne LBB0_2
@@ -63,27 +63,27 @@ define zeroext i16 @test2(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
; X64-LABEL: test2:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movl %esi, %eax
-; X64-NEXT: decl %eax
-; X64-NEXT: cmpw %di, %si
+; X64-NEXT: decl %esi
+; X64-NEXT: cmpw %di, %ax
; X64-NEXT: jne LBB1_2
; X64-NEXT: ## %bb.1: ## %bb
; X64-NEXT: pushq %rbx
-; X64-NEXT: movzwl %ax, %ebx
+; X64-NEXT: movzwl %si, %ebx
; X64-NEXT: movl %ebx, %edi
; X64-NEXT: callq _foo
; X64-NEXT: movl %ebx, %eax
; X64-NEXT: popq %rbx
; X64-NEXT: retq
; X64-NEXT: LBB1_2: ## %bb1
-; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %si, %eax
; X64-NEXT: retq
;
; X86-LABEL: test2:
; X86: ## %bb.0: ## %entry
; X86-NEXT: pushl %esi
; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: decl %eax
; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx
; X86-NEXT: jne LBB1_2
@@ -118,27 +118,27 @@ define zeroext i16 @test3(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
; X64-LABEL: test3:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movl %esi, %eax
-; X64-NEXT: addl $2, %eax
-; X64-NEXT: cmpw %di, %si
+; X64-NEXT: addl $2, %esi
+; X64-NEXT: cmpw %di, %ax
; X64-NEXT: jne LBB2_2
; X64-NEXT: ## %bb.1: ## %bb
; X64-NEXT: pushq %rbx
-; X64-NEXT: movzwl %ax, %ebx
+; X64-NEXT: movzwl %si, %ebx
; X64-NEXT: movl %ebx, %edi
; X64-NEXT: callq _foo
; X64-NEXT: movl %ebx, %eax
; X64-NEXT: popq %rbx
; X64-NEXT: retq
; X64-NEXT: LBB2_2: ## %bb1
-; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %si, %eax
; X64-NEXT: retq
;
; X86-LABEL: test3:
; X86: ## %bb.0: ## %entry
; X86-NEXT: pushl %esi
; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl $2, %eax
; X86-NEXT: cmpw {{[0-9]+}}(%esp), %cx
; X86-NEXT: jne LBB2_2
@@ -171,19 +171,19 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
; X64-LABEL: test4:
; X64: ## %bb.0: ## %entry
; X64-NEXT: movl %esi, %eax
-; X64-NEXT: addl %edi, %eax
-; X64-NEXT: cmpw %di, %si
+; X64-NEXT: addl %edi, %esi
+; X64-NEXT: cmpw %di, %ax
; X64-NEXT: jne LBB3_2
; X64-NEXT: ## %bb.1: ## %bb
; X64-NEXT: pushq %rbx
-; X64-NEXT: movzwl %ax, %ebx
+; X64-NEXT: movzwl %si, %ebx
; X64-NEXT: movl %ebx, %edi
; X64-NEXT: callq _foo
; X64-NEXT: movl %ebx, %eax
; X64-NEXT: popq %rbx
; X64-NEXT: retq
; X64-NEXT: LBB3_2: ## %bb1
-; X64-NEXT: movzwl %ax, %eax
+; X64-NEXT: movzwl %si, %eax
; X64-NEXT: retq
;
; X86-LABEL: test4:
@@ -191,8 +191,8 @@ define zeroext i16 @test4(i16 zeroext %c, i16 zeroext %k) nounwind ssp {
; X86-NEXT: pushl %esi
; X86-NEXT: subl $8, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: cmpw %cx, %dx
; X86-NEXT: jne LBB3_2
diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
index 06cf968..8a8e7a3 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
@@ -297,30 +297,30 @@ define dso_local void @test6(i16 signext %0) nounwind {
; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movl $buf, %ecx
-; CHECK-NEXT: movl $32, %edx
-; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: xorl %ecx, %ecx
+; CHECK-NEXT: movl $buf, %edx
+; CHECK-NEXT: movl $32, %esi
; CHECK-NEXT: jmp .LBB5_1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB5_3: # %if.false
; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: decl %esi
+; CHECK-NEXT: decl %eax
; CHECK-NEXT: .LBB5_4: # %loop.bb2
; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: leal (%rdi,%rsi), %r8d
+; CHECK-NEXT: leal (%rdi,%rax), %r8d
; CHECK-NEXT: movw %r8w, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: cmpw $7, %si
+; CHECK-NEXT: cmpw $7, %ax
; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: tilestored %tmm0, (%rcx,%rdx)
+; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
; CHECK-NEXT: jne .LBB5_5
; CHECK-NEXT: .LBB5_1: # %loop.bb1
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: testb %cl, %cl
; CHECK-NEXT: jne .LBB5_3
; CHECK-NEXT: # %bb.2: # %if.true
; CHECK-NEXT: # in Loop: Header=BB5_1 Depth=1
-; CHECK-NEXT: incl %esi
+; CHECK-NEXT: incl %eax
; CHECK-NEXT: jmp .LBB5_4
; CHECK-NEXT: .LBB5_5: # %exit
; CHECK-NEXT: tilerelease
diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
index 87059c5..6ae7b22 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 {
; CHECK-LABEL: @test_no_bitcast(
diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
index 5fb2dcd..ca7c357 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) {
; CHECK-LABEL: @test_amx_load_non_O0(
diff --git a/llvm/test/CodeGen/X86/GlobalISel/fp-bitcast.ll b/llvm/test/CodeGen/X86/GlobalISel/fp-bitcast.ll
new file mode 100644
index 0000000..dad33ca
--- /dev/null
+++ b/llvm/test/CodeGen/X86/GlobalISel/fp-bitcast.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -global-isel | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -global-isel -mattr=+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -global-isel -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512
+
+define half @test_i16_to_half(i16 %0) {
+; SSE2-LABEL: test_i16_to_half:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_i16_to_half:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovd %edi, %xmm0
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_i16_to_half:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovd %edi, %xmm0
+; AVX512-NEXT: retq
+entry:
+ %2 = bitcast i16 %0 to half
+ ret half %2
+}
+
+define i16 @test_half_to_i16(half %0) {
+; SSE2-LABEL: test_half_to_i16:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: # kill: def $eax killed $eax def $ax
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_half_to_i16:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: # kill: def $eax killed $eax def $ax
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_half_to_i16:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovd %xmm0, %eax
+; AVX512-NEXT: # kill: def $eax killed $eax def $ax
+; AVX512-NEXT: retq
+entry:
+ %2 = bitcast half %0 to i16
+ ret i16 %2
+}
diff --git a/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll
new file mode 100644
index 0000000..841c9a6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=CHECK
+
+define void @test_reloc_none() {
+; CHECK-LABEL: test_reloc_none:
+; CHECK: # %bb.0:
+; CHECK-NEXT: .Lreloc_none0:
+; CHECK-NEXT: .reloc .Lreloc_none0, BFD_RELOC_NONE, foo
+; CHECK-NEXT: retq
+ call void @llvm.reloc.none(metadata !"foo")
+ ret void
+}
+
+declare void @llvm.reloc.none(metadata)
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir
index 41e1b5b..5c059a4 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir
@@ -1,5 +1,6 @@
-# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
-# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X86
+# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X64
--- |
@@ -30,24 +31,23 @@
...
---
name: test_copy
-# ALL-LABEL: name: test_copy
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
-# ALL: %0:gr8 = COPY $al
-# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0
-# ALL-NEXT: $eax = COPY %1
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; CHECK-LABEL: name: test_copy
+ ; CHECK: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al
+ ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]]
+ ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s8) = COPY $al
%1(s32) = G_ZEXT %0(s8)
$eax = COPY %1(s32)
@@ -56,24 +56,23 @@ body: |
...
---
name: test_copy2
-# ALL-LABEL: name: test_copy2
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
-# ALL: %0:gr8 = COPY $al
-# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0
-# ALL-NEXT: $eax = COPY %1
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; CHECK-LABEL: name: test_copy2
+ ; CHECK: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al
+ ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]]
+ ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s8) = COPY $al
%1(s32) = G_ZEXT %0(s8)
$eax = COPY %1(s32)
@@ -82,30 +81,35 @@ body: |
...
---
name: test_copy3
-# ALL-LABEL: name: test_copy3
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr16[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] }
-# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] }
-# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr16 = COPY $ax
-# X32-NEXT: %3:gr16_abcd = COPY %0
-# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit
-# X64-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; X86-LABEL: name: test_copy3
+ ; X86: liveins: $eax
+ ; X86-NEXT: {{ $}}
+ ; X86-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax
+ ; X86-NEXT: [[COPY1:%[0-9]+]]:gr16_abcd = COPY [[COPY]]
+ ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit
+ ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]]
+ ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X86-NEXT: RET 0, implicit $eax
+ ;
+ ; X64-LABEL: name: test_copy3
+ ; X64: liveins: $eax
+ ; X64-NEXT: {{ $}}
+ ; X64-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax
+ ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
+ ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]]
+ ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X64-NEXT: RET 0, implicit $eax
%0(s16) = COPY $ax
%1(s8) = G_TRUNC %0(s16)
%2(s32) = G_ZEXT %1(s8)
@@ -115,27 +119,25 @@ body: |
...
---
name: test_copy4
-# ALL-LABEL: name: test_copy4
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr32 = COPY $eax
-# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit
-# ALL-NEXT: %2:gr32 = MOVZX32rr16 %1
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; CHECK-LABEL: name: test_copy4
+ ; CHECK: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $eax
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
+ ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[COPY1]]
+ ; CHECK-NEXT: $eax = COPY [[MOVZX32rr16_]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s32) = COPY $eax
%1(s16) = G_TRUNC %0(s32)
%2(s32) = G_ZEXT %1(s16)
@@ -145,30 +147,35 @@ body: |
...
---
name: test_copy5
-# ALL-LABEL: name: test_copy5
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] }
-# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] }
-# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr32 = COPY $edx
-# X32-NEXT: %3:gr32_abcd = COPY %0
-# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit
-# X64-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax,$edx
+ ; X86-LABEL: name: test_copy5
+ ; X86: liveins: $eax, $edx
+ ; X86-NEXT: {{ $}}
+ ; X86-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
+ ; X86-NEXT: [[COPY1:%[0-9]+]]:gr32_abcd = COPY [[COPY]]
+ ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit
+ ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]]
+ ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X86-NEXT: RET 0, implicit $eax
+ ;
+ ; X64-LABEL: name: test_copy5
+ ; X64: liveins: $eax, $edx
+ ; X64-NEXT: {{ $}}
+ ; X64-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
+ ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
+ ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]]
+ ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X64-NEXT: RET 0, implicit $eax
%0(s32) = COPY $edx
%1(s8) = G_TRUNC %0(s32)
%2(s32) = G_ANYEXT %1(s8)
@@ -178,29 +185,26 @@ body: |
...
---
name: test_copy6
-# ALL-LABEL: name: test_copy6
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 3, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr32 = COPY $edx
-# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit
-# ALL-NEXT: %3:low32_addr_access_rbp = IMPLICIT_DEF
-# ALL-NEXT: %2:low32_addr_access_rbp = INSERT_SUBREG %3, %1, %subreg.sub_16bit
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax,$edx
+ ; CHECK-LABEL: name: test_copy6
+ ; CHECK: liveins: $eax, $edx
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:low32_addr_access_rbp = IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:low32_addr_access_rbp = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.sub_16bit
+ ; CHECK-NEXT: $eax = COPY [[INSERT_SUBREG]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s32) = COPY $edx
%1(s16) = G_TRUNC %0(s32)
%2(s32) = G_ANYEXT %1(s16)
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index 0fbfb42..9223348 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -7,9 +7,11 @@
; CHECK-LABEL: Pass Arguments:
; CHECK-NEXT: Target Library Information
+; CHECK-NEXT: Runtime Library Function Analysis
; CHECK-NEXT: Target Pass Configuration
; CHECK-NEXT: Machine Module Information
; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Library Function Lowering Analysis
; CHECK-NEXT: Create Garbage Collector Module Metadata
; CHECK-NEXT: Assumption Cache Tracker
; CHECK-NEXT: Profile summary info
@@ -68,8 +70,6 @@
; CHECK-NEXT: X86 Indirect Branch Tracking
; CHECK-NEXT: X86 vzeroupper inserter
; CHECK-NEXT: Compressing EVEX instrs when possible
-; CHECK-NEXT: X86 Discriminate Memory Operands
-; CHECK-NEXT: X86 Insert Cache Prefetches
; CHECK-NEXT: X86 insert wait instruction
; CHECK-NEXT: Contiguously Lay Out Funclets
; CHECK-NEXT: Remove Loads Into Fake Uses
diff --git a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir
index 348a290..2445306 100644
--- a/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir
+++ b/llvm/test/CodeGen/X86/StackColoring-dbg-invariance.mir
@@ -55,7 +55,7 @@
!9 = !DILocalVariable(name: "4", scope: !5, file: !1, line: 4, type: !10)
!10 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned)
!11 = !DILocation(line: 4, column: 1, scope: !5)
- !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8)
+ !12 = distinct !DISubprogram(name: "test_2", linkageName: "test_2", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !7)
...
---
diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll
index 97894db..ee44820 100644
--- a/llvm/test/CodeGen/X86/addcarry.ll
+++ b/llvm/test/CodeGen/X86/addcarry.ll
@@ -1513,3 +1513,32 @@ define i1 @pr84831(i64 %arg) {
%trunc = trunc i63 %or to i1
ret i1 %trunc
}
+
+define void @pr169691(ptr %p0, i64 %implicit, i1 zeroext %carry) {
+; CHECK-LABEL: pr169691:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addb $-1, %dl
+; CHECK-NEXT: adcq %rsi, (%rdi)
+; CHECK-NEXT: adcq %rsi, 8(%rdi)
+; CHECK-NEXT: retq
+ %a0 = load i64, ptr %p0, align 8
+ %uaddo0 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a0, i64 %implicit)
+ %uaddo0.1 = extractvalue { i64, i1 } %uaddo0, 1
+ %uaddo0.0 = extractvalue { i64, i1 } %uaddo0, 0
+ %zextc = zext i1 %carry to i64
+ %uaddo0b = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %uaddo0.0, i64 %zextc)
+ %uaddo0b.1 = extractvalue { i64, i1 } %uaddo0b, 1
+ %uaddo0b.0 = extractvalue { i64, i1 } %uaddo0b, 0
+ %carry0 = or i1 %uaddo0.1, %uaddo0b.1
+ store i64 %uaddo0b.0, ptr %p0, align 8
+
+ %p1 = getelementptr inbounds nuw i8, ptr %p0, i64 8
+ %a1 = load i64, ptr %p1, align 8
+ %uaddo1 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a1, i64 %implicit)
+ %uaddo1.0 = extractvalue { i64, i1 } %uaddo1, 0
+ %zext0 = zext i1 %carry0 to i64
+ %uaddo1b = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %uaddo1.0, i64 %zext0)
+ %uaddo1b.0 = extractvalue { i64, i1 } %uaddo1b, 0
+ store i64 %uaddo1b.0, ptr %p1, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/amx-tf32-internal.ll b/llvm/test/CodeGen/X86/amx-tf32-internal.ll
index 6d0f3c5..caf7a1c 100644
--- a/llvm/test/CodeGen/X86/amx-tf32-internal.ll
+++ b/llvm/test/CodeGen/X86/amx-tf32-internal.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+avx512f, \
-; RUN: -mattr=+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s
+; RUN: -mattr=+amx-tf32 -verify-machineinstrs | FileCheck %s
define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
; CHECK-LABEL: test_amx:
@@ -20,7 +20,6 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
; CHECK-NEXT: tilezero %tmm1
; CHECK-NEXT: tilezero %tmm2
; CHECK-NEXT: tmmultf32ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: ttmmultf32ps %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx)
; CHECK-NEXT: tilerelease
; CHECK-NEXT: vzeroupper
@@ -31,9 +30,8 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
%c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
%c1 = call x86_amx @llvm.x86.tmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
- %c2 = call x86_amx @llvm.x86.ttmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2)
+ call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c1)
ret void
}
@@ -43,4 +41,3 @@ declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
declare x86_amx @llvm.x86.tmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
diff --git a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
index af1a7ae..642c1b7 100644
--- a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32 -verify-machineinstrs | FileCheck %s
define void @test_tmmultf32ps() {
; CHECK-LABEL: test_tmmultf32ps:
@@ -11,13 +11,3 @@ define void @test_tmmultf32ps() {
}
declare void @llvm.x86.tmmultf32ps(i8 %A, i8 %B, i8 %C)
-define void @test_ttmmultf32ps() {
-; CHECK-LABEL: test_ttmmultf32ps:
-; CHECK: # %bb.0:
-; CHECK-NEXT: ttmmultf32ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT: retq
- call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3)
- ret void
-}
-declare void @llvm.x86.ttmmultf32ps(i8 %A, i8 %B, i8 %C)
-
diff --git a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
deleted file mode 100755
index 1f5758c..0000000
--- a/llvm/test/CodeGen/X86/amx_movrs_transpose_intrinsics.ll
+++ /dev/null
@@ -1,122 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O0
-; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs | FileCheck %s --check-prefixes=CHECK,O2
-; RUN: llc < %s -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-transpose,+amx-movrs,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR
-
-define void @test_amx(i64 %stride, i8* %addr1) #0 {
-; CHECK-LABEL: test_amx:
-; CHECK: # %bb.0:
-; CHECK-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0
-; CHECK-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2
-; CHECK-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0
-; CHECK-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2
-; CHECK-NEXT: retq
-;
-; EGPR-LABEL: test_amx:
-; EGPR: # %bb.0:
-; EGPR-NEXT: t2rpntlvwz0rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x04,0x3e]
-; EGPR-NEXT: t2rpntlvwz0rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x14,0x3e]
-; EGPR-NEXT: t2rpntlvwz1rs (%rsi,%rdi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x04,0x3e]
-; EGPR-NEXT: t2rpntlvwz1rst1 (%rsi,%rdi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x14,0x3e]
-; EGPR-NEXT: retq # encoding: [0xc3]
- call void @llvm.x86.t2rpntlvwz0rs(i8 1, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz0rst1(i8 2, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz1rs(i8 1, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz1rst1(i8 2, i8* %addr1, i64 %stride)
- ret void
-}
-declare void @llvm.x86.t2rpntlvwz0rs(i8 , i8* , i64 )
-declare void @llvm.x86.t2rpntlvwz0rst1(i8 , i8* , i64 )
-declare void @llvm.x86.t2rpntlvwz1rs(i8 , i8* , i64 )
-declare void @llvm.x86.t2rpntlvwz1rst1(i8 , i8* , i64 )
-
-define void @test_amx2(i8* %base, i64 %stride) #0 {
-; O0-LABEL: test_amx2:
-; O0: # %bb.0:
-; O0-NEXT: xorps %xmm0, %xmm0
-; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movb $1, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw $8, %ax
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; O0-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
-; O0-NEXT: movw $8, %ax
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; O0-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4
-; O0-NEXT: movw $8, %ax
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: # implicit-def: $al
-; O0-NEXT: movb %al, -{{[0-9]+}}(%rsp)
-; O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; O0-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; O0-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
-; O0-NEXT: tilerelease
-; O0-NEXT: retq
-;
-; O2-LABEL: test_amx2:
-; O2: # %bb.0:
-; O2-NEXT: xorps %xmm0, %xmm0
-; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movb $1, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; O2-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; O2-NEXT: movw $8, %ax
-; O2-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4
-; O2-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4
-; O2-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4
-; O2-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4
-; O2-NEXT: tilerelease
-; O2-NEXT: retq
-;
-; EGPR-LABEL: test_amx2:
-; EGPR: # %bb.0:
-; EGPR-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
-; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xc0]
-; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xd0]
-; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xe0]
-; EGPR-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x11,0x44,0x24,0xf0]
-; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x08,0x00]
-; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0]
-; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
-; EGPR-NEXT: t2rpntlvwz0rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf8,0x24,0x37]
-; EGPR-NEXT: t2rpntlvwz0rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x78,0xf9,0x24,0x37]
-; EGPR-NEXT: t2rpntlvwz1rs (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf8,0x24,0x37]
-; EGPR-NEXT: t2rpntlvwz1rst1 (%rdi,%rsi), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe5,0x79,0xf9,0x24,0x37]
-; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT: retq # encoding: [0xc3]
- call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- ret void
-}
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rs.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0rst1.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rs.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1rst1.internal(i16, i16, i16, i8*, i64)
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll
deleted file mode 100644
index 4f41410..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_O2_to_O0.ll
+++ /dev/null
@@ -1,136 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-; RUN: -mattr=+amx-transpose -verify-machineinstrs | FileCheck %s
-
-@buf = dso_local global [2048 x i8] zeroinitializer, align 16
-@buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
-
-define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
-; CHECK-LABEL: test_tile_2rpntlvwz0:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset %rbp, -16
-; CHECK-NEXT: movq %rsp, %rbp
-; CHECK-NEXT: .cfi_def_cfa_register %rbp
-; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
-; CHECK-NEXT: subq $8192, %rsp # imm = 0x2000
-; CHECK-NEXT: .cfi_offset %rbx, -24
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # kill: def $dx killed $dx killed $edx
-; CHECK-NEXT: movw %si, %cx
-; CHECK-NEXT: movw %di, %ax
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $cl
-; CHECK-NEXT: movb %cl, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: # implicit-def: $al
-; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %dx, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movl $buf, %esi
-; CHECK-NEXT: movl $32, %edi
-; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdi), %tmm4
-; CHECK-NEXT: movabsq $64, %rbx
-; CHECK-NEXT: tilestored %tmm5, (%rsp,%rbx) # 1024-byte Folded Spill
-; CHECK-NEXT: tileloadd (%rsp,%rbx), %tmm0 # 1024-byte Folded Reload
-; CHECK-NEXT: movabsq $64, %rbx
-; CHECK-NEXT: tilestored %tmm4, 1024(%rsp,%rbx) # 1024-byte Folded Spill
-; CHECK-NEXT: tileloadd 1024(%rsp,%rbx), %tmm1 # 1024-byte Folded Reload
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tilestored %tmm1, (%rsi,%rdi)
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
-; CHECK-NEXT: tilezero %tmm0
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm1
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm2
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
-; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tilestored %tmm0, (%rsi,%rdi)
-; CHECK-NEXT: movl $64, %edi
-; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: tileloadd (%rsi,%rdi), %tmm0
-; CHECK-NEXT: movl $buf2, %edx
-; CHECK-NEXT: movl $32, %esi
-; CHECK-NEXT: tilestored %tmm0, (%rdx,%rsi)
-; CHECK-NEXT: leaq -8(%rbp), %rsp
-; CHECK-NEXT: popq %rbx
-; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: .cfi_def_cfa %rsp, 8
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-entry:
- %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #3
- %1 = extractvalue { x86_amx, x86_amx } %0, 0
- %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #3
- %3 = extractvalue { x86_amx, x86_amx } %0, 1
- %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #3
- %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #3
- %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #3
- %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #3
- %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #3
- %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #3
- %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #3
- %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #3
- %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #3
- tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #3
- ret void
-}
-
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1
-
-declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
-
-declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
-
-declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
-
-declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
-
-declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4
-
-attributes #0 = { nounwind uwtable "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose" }
-attributes #1 = { argmemonly nofree nounwind readonly }
-attributes #2 = { nofree nosync nounwind readnone }
-attributes #3 = { nounwind }
-attributes #4 = { argmemonly nounwind writeonly }
-
-!llvm.module.flags = !{!0, !1, !2}
-
-!0 = !{i32 1, !"wchar_size", i32 4}
-!1 = !{i32 7, !"uwtable", i32 2}
-!2 = !{i32 7, !"frame-pointer", i32 2}
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
deleted file mode 100644
index ab12ab3..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O0.mir
+++ /dev/null
@@ -1,165 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=fasttileconfig -o - %s | FileCheck %s
-
----
-name: test_tile_2rpntlvwz0
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes: false
-hasEHFunclets: false
-failsVerification: false
-tracksDebugUserValues: false
-registers: []
-liveins:
- - { reg: '$edi', virtual-reg: '' }
- - { reg: '$esi', virtual-reg: '' }
- - { reg: '$edx', virtual-reg: '' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 1024
- adjustsStack: false
- hasCalls: true
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: []
- restorePoint: []
-fixedStack: []
-stack:
- - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 1, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 2, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 3, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 4, name: '', type: default, offset: 0, size: 64, alignment: 4,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 5, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 6, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 7, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo:
- amxProgModel: ManagedRA
-body: |
- bb.0.entry:
- liveins: $rdi, $rsi, $rdx, $rax
-
- ; CHECK-LABEL: name: test_tile_2rpntlvwz0
- ; CHECK: liveins: $rdi, $rsi, $rdx, $rax
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: renamable $zmm0 = AVX512_512_SET0
- ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4)
- ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
- ; CHECK-NEXT: renamable $rcx = MOV32ri64 64
- ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7)
- ; CHECK-NEXT: renamable $cx = MOV16ri 64
- ; CHECK-NEXT: MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5)
- ; CHECK-NEXT: renamable $cx = MOV16ri 16
- ; CHECK-NEXT: renamable $r8w = MOV16ri 16
- ; CHECK-NEXT: MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 50, $noreg, $al :: (store (s512) into %stack.4 + 50, align 2, basealign 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 20, $noreg, $cx :: (store (s512) into %stack.4 + 20, align 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 49, $noreg, $al :: (store (s512) into %stack.4 + 49, align 1, basealign 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 18, $noreg, $di :: (store (s512) into %stack.4 + 18, align 2, basealign 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 48, $noreg, $al :: (store (s512) into %stack.4 + 48, align 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 16, $noreg, $cx :: (store (s512) into %stack.4 + 16, align 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 52, $noreg, $al :: (store (s512) into %stack.4 + 52, align 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 24, $noreg, $cx :: (store (s512) into %stack.4 + 24, align 4)
- ; CHECK-NEXT: $al = IMPLICIT_DEF
- ; CHECK-NEXT: MOV8mr %stack.4, 1, $noreg, 53, $noreg, $al :: (store (s512) into %stack.4 + 53, align 1, basealign 4)
- ; CHECK-NEXT: MOV16mr %stack.4, 1, $noreg, 26, $noreg, $di :: (store (s512) into %stack.4 + 26, align 2, basealign 4)
- ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4)
- ; CHECK-NEXT: renamable $r9 = COPY $rsi
- ; CHECK-NEXT: $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
- ; CHECK-NEXT: renamable $r8 = COPY $rdi
- ; CHECK-NEXT: $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
- ; CHECK-NEXT: renamable $r10 = COPY $rax
- ; CHECK-NEXT: $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5)
- ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg
- ; CHECK-NEXT: renamable $tmm0 = COPY renamable $tmm5
- ; CHECK-NEXT: renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
- ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1
- ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
- ; CHECK-NEXT: renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx
- ; CHECK-NEXT: PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
- ; CHECK-NEXT: renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
- ; CHECK-NEXT: renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg
- ; CHECK-NEXT: renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg
- ; CHECK-NEXT: renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
- ; CHECK-NEXT: PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
- renamable $zmm0 = AVX512_512_SET0
- VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, killed renamable $zmm0 :: (store (s512) into %stack.4, align 4)
- MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
- renamable $rcx = MOV32ri64 64
- MOV64mr %stack.7, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.7)
- renamable $cx = MOV16ri 64
- MOV16mr %stack.5, 1, $noreg, 0, $noreg, $cx :: (store (s16) into %stack.5)
- renamable $cx = MOV16ri 16
- renamable $r8w = MOV16ri 16
- MOV16mr %stack.6, 1, $noreg, 0, $noreg, $r8w :: (store (s16) into %stack.6)
- PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.4, align 4)
- renamable $r9 = COPY $rsi
- $rsi = MOV64rm %stack.7, 1, $noreg, 0, $noreg :: (load (s64) from %stack.7)
- renamable $r8 = COPY $rdi
- $di = MOV16rm %stack.6, 1, $noreg, 0, $noreg :: (load (s16) from %stack.6)
- renamable $r10 = COPY $rax
- $ax = MOV16rm %stack.5, 1, $noreg, 0, $noreg :: (load (s16) from %stack.5)
- renamable $tmm4_tmm5 = PT2RPNTLVWZ0V renamable $ax, renamable $cx, renamable $di, renamable $rdx, 1, killed renamable $r10, 0, $noreg
- renamable $tmm0 = COPY renamable $tmm5
- renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
- PTILESTOREDV renamable $ax, renamable $cx, renamable $r9, 1, renamable $rsi, 0, $noreg, killed renamable $tmm1
- PTILESTOREDV renamable $ax, renamable $di, renamable $r8, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
- renamable $tmm0 = PTILEZEROV renamable $ax, renamable $cx
- PTILESTOREDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg, killed renamable $tmm0
- renamable $tmm0 = PTILELOADDV renamable $ax, renamable $cx, killed renamable $r9, 1, renamable $rsi, 0, $noreg
- renamable $tmm1 = PTILELOADDV renamable $ax, renamable $di, killed renamable $r8, 1, renamable $rsi, 0, $noreg
- renamable $tmm2 = PTILELOADDV renamable $ax, renamable $cx, renamable $rdx, 1, renamable $rsi, 0, $noreg
- renamable $tmm0 = PTDPBSSDV renamable $ax, renamable $cx, killed renamable $di, renamable $tmm0, killed renamable $tmm1, killed renamable $tmm2
- PTILESTOREDV killed renamable $ax, killed renamable $cx, killed renamable $rdx, 1, killed renamable $rsi, 0, $noreg, killed renamable $tmm0
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
deleted file mode 100644
index c7d241f..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_configure_O2.mir
+++ /dev/null
@@ -1,153 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=greedy,tileconfig -o - %s | FileCheck %s
-
---- |
- @buf = dso_local global [2048 x i8] zeroinitializer, align 16
- @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
-
- define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
- entry:
- %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf, i64 0, i64 0), i64 32) #5
- %1 = extractvalue { x86_amx, x86_amx } %0, 0
- %2 = extractvalue { x86_amx, x86_amx } %0, 1
- %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5
- %4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %3, x86_amx %1, x86_amx %2) #5
- tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, i8* getelementptr inbounds ([2048 x i8], [2048 x i8]* @buf2, i64 0, i64 0), i64 32, x86_amx %4) #5
- ret void
- }
-
- declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64) #1
-
- declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
-
- declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
-
- declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
-
- declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
-
- declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) #4
-
- attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
- attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #5 = { nounwind }
-
-...
----
-name: test_tile_2rpntlvwz0
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes: false
-hasEHFunclets: false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
- - { id: 0, class: gr32, preferred-register: '' }
- - { id: 1, class: gr32, preferred-register: '' }
- - { id: 2, class: gr32, preferred-register: '' }
- - { id: 3, class: gr16, preferred-register: '' }
- - { id: 4, class: gr16, preferred-register: '' }
- - { id: 5, class: gr16, preferred-register: '' }
- - { id: 6, class: gr64, preferred-register: '' }
- - { id: 7, class: gr64_nosp, preferred-register: '' }
- - { id: 8, class: tilepair, preferred-register: '' }
- - { id: 9, class: tile, preferred-register: '' }
- - { id: 10, class: tile, preferred-register: '' }
- - { id: 11, class: tile, preferred-register: '' }
- - { id: 12, class: tile, preferred-register: '' }
- - { id: 13, class: gr64, preferred-register: '' }
- - { id: 14, class: vr512, preferred-register: '' }
-liveins:
- - { reg: '$edi', virtual-reg: '%0' }
- - { reg: '$esi', virtual-reg: '%1' }
- - { reg: '$edx', virtual-reg: '%2' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 4
- adjustsStack: false
- hasCalls: false
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: []
- restorePoint: []
-fixedStack: []
-stack:
- - { id: 0, name: '', type: default, offset: 0, size: 64, alignment: 4,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo:
- amxProgModel: ManagedRA
-body: |
- bb.0.entry:
- liveins: $edi, $esi, $edx
-
-
- ; CHECK-LABEL: name: test_tile_2rpntlvwz0
- ; CHECK: liveins: $edi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi
- ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
- ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
- ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
- ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 26, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 26, align 2, basealign 4)
- ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 53, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 53, align 1, basealign 4)
- ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 24, $noreg, [[COPY1]].sub_16bit :: (store (s512) into %stack.0 + 24, align 4)
- ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 52, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 52, align 4)
- ; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, [[COPY]].sub_16bit :: (store (s512) into %stack.0 + 16, align 4)
- ; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[COPY2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4)
- ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
- ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 @buf
- ; CHECK-NEXT: [[MOV32ri64_1:%[0-9]+]]:gr64_nosp = MOV32ri64 32
- ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[COPY]].sub_16bit, [[MOV32ri64_]], 1, [[MOV32ri64_1]], 0, $noreg
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY2]].sub_16bit, [[COPY]].sub_16bit, [[COPY1]].sub_16bit, [[PTILEZEROV]], [[PT2RPNTLVWZ0V]].sub_t0, [[PT2RPNTLVWZ0V]].sub_t1
- ; CHECK-NEXT: [[MOV32ri64_2:%[0-9]+]]:gr64 = MOV32ri64 @buf2
- ; CHECK-NEXT: PTILESTOREDV [[COPY2]].sub_16bit, [[COPY1]].sub_16bit, [[MOV32ri64_2]], 1, [[MOV32ri64_1]], 0, $noreg, [[PTILEZEROV]]
- ; CHECK-NEXT: RET 0
- %2:gr32 = COPY $edx
- %1:gr32 = COPY $esi
- %0:gr32 = COPY $edi
- %14:vr512 = AVX512_512_SET0
- VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, %14 :: (store (s512) into %stack.0, align 4)
- MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
- PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
- %6:gr64 = MOV32ri64 @buf
- %7:gr64_nosp = MOV32ri64 32
- %8:tilepair = PT2RPNTLVWZ0V %0.sub_16bit, %1.sub_16bit, %2.sub_16bit, %6, 1, %7, 0, $noreg
- %12:tile = PTILEZEROV %0.sub_16bit, %1.sub_16bit
- %12:tile = PTDPBSSDV %0.sub_16bit, %2.sub_16bit, %1.sub_16bit, %12, %8.sub_t0, %8.sub_t1
- %13:gr64 = MOV32ri64 @buf2
- PTILESTOREDV %0.sub_16bit, %1.sub_16bit, %13, 1, %7, 0, $noreg, %12
- RET 0
-
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir b/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
deleted file mode 100644
index 66b15aa..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_copy.mir
+++ /dev/null
@@ -1,97 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=lowertilecopy -o - %s | FileCheck %s
-
----
-name: test_tile_2rpntlvwz0
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes: false
-hasEHFunclets: false
-failsVerification: false
-tracksDebugUserValues: false
-registers: []
-liveins:
- - { reg: '$edi', virtual-reg: '' }
- - { reg: '$esi', virtual-reg: '' }
- - { reg: '$edx', virtual-reg: '' }
- - { reg: '$cx', virtual-reg: '' }
- - { reg: '$r9', virtual-reg: '' }
- - { reg: '$r10', virtual-reg: '' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 1024
- adjustsStack: false
- hasCalls: true
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: []
- restorePoint: []
-fixedStack: []
-stack:
- - { id: 43, name: '', type: default, offset: 0, size: 64, alignment: 4,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 68, name: '', type: spill-slot, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo:
- amxProgModel: ManagedRA
-body: |
- bb.0.entry:
- liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9
-
-
- ; CHECK-LABEL: name: test_tile_2rpntlvwz0
- ; CHECK: liveins: $edi, $esi, $edx, $cx, $di, $r8w, $r11, $r10, $rbx, $r8, $r9
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
- ; CHECK-NEXT: renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg
- ; CHECK-NEXT: $rax = MOV64ri 64
- ; CHECK-NEXT: TILESTORED %stack.3, 1, $rax, 0, $noreg, $tmm5 :: (store (s8192) into %stack.3)
- ; CHECK-NEXT: $tmm0 = TILELOADD %stack.3, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.3)
- ; CHECK-NEXT: $rax = MOV64ri 64
- ; CHECK-NEXT: TILESTORED %stack.2, 1, $rax, 0, $noreg, $tmm4 :: (store (s8192) into %stack.2)
- ; CHECK-NEXT: $tmm1 = TILELOADD %stack.2, 1, killed $rax, 0, $noreg :: (load (s8192) from %stack.2)
- ; CHECK-NEXT: renamable $r8 = MOV32ri64 64
- ; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.1)
- ; CHECK-NEXT: renamable $di = MOV16ri 64
- ; CHECK-NEXT: renamable $cx = MOV16ri 16
- ; CHECK-NEXT: PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1
- ; CHECK-NEXT: PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
- PLDTILECFGV %stack.43, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.43, align 4)
- renamable $tmm4_tmm5 = PT2RPNTLVWZ0V killed renamable $cx, killed renamable $di, killed renamable $r8w, killed renamable $r11, 1, killed renamable $rbx, 0, $noreg
- renamable $tmm0 = COPY renamable $tmm5
- renamable $tmm1 = COPY renamable $tmm4, implicit killed $tmm4_tmm5
- renamable $r8 = MOV32ri64 64
- MOV64mr %stack.68, 1, $noreg, 0, $noreg, $r8 :: (store (s64) into %stack.68)
- renamable $di = MOV16ri 64
- renamable $cx = MOV16ri 16
- PTILESTOREDV renamable $cx, renamable $di, killed renamable $r10, 1, renamable $r8, 0, $noreg, killed renamable $tmm1
- PTILESTOREDV killed renamable $cx, killed renamable $di, killed renamable $r9, 1, renamable $r8, 0, $noreg, killed renamable $tmm0
-
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
deleted file mode 100644
index 3549875..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O0.ll
+++ /dev/null
@@ -1,87 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
- ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
- ; RUN: opt --codegen-opt-level=0 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
-
- @buf = dso_local global [2048 x i8] zeroinitializer, align 16
-
- ; Function Attrs: noinline nounwind optnone uwtable
- define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1, ptr %m) #0 {
-; CHECK-LABEL: @test_tile_2rpntlvwz0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = udiv i16 [[COL1:%.*]], 4
-; CHECK-NEXT: [[TMP1:%.*]] = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M:%.*]], i64 [[TMP3]], x86_amx [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP1]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = sext i16 [[COL1]] to i64
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP6]], x86_amx [[TMP5]])
-; CHECK-NEXT: [[TMP8:%.*]] = call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]]
-; CHECK-NEXT: [[TMP9:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP9]], x86_amx [[TMP8]])
-; CHECK-NEXT: [[TMP11:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT: [[TMP13:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP11]])
-; CHECK-NEXT: [[TMP14:%.*]] = sext i16 [[COL1]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[ROW]], i16 [[COL1]], ptr [[M]], i64 [[TMP14]])
-; CHECK-NEXT: [[TMP17:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT: [[TMP19:%.*]] = call x86_amx @llvm.x86.tileloadd64.internal(i16 [[TMP0]], i16 [[COL0]], ptr [[M]], i64 [[TMP17]])
-; CHECK-NEXT: [[TMP20:%.*]] = call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL0]], i16 [[COL1]], x86_amx [[TMP13]], x86_amx [[TMP16]], x86_amx [[TMP19]]) #[[ATTR3]]
-; CHECK-NEXT: [[TMP21:%.*]] = sext i16 [[COL0]] to i64
-; CHECK-NEXT: call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr [[M]], i64 [[TMP21]], x86_amx [[TMP20]])
-; CHECK-NEXT: ret void
-;
- entry:
-
- %0 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr getelementptr inbounds ([2048 x i8], ptr @buf, i64 0, i64 0), i64 32) #7
- %1 = extractvalue { x86_amx, x86_amx } %0, 0
- %2 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #7
- store <256 x i32> %2, ptr %m, align 1024
-
- %3 = extractvalue { x86_amx, x86_amx } %0, 1
- %4 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #7
- store <256 x i32> %4, ptr %m, align 1024
-
- %5 = call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #7
- %6 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #7
- store <256 x i32> %6, ptr %m, align 64
-
- %7 = load <256 x i32>, ptr %m, align 64
- %8 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %7) #7
- %9 = load <256 x i32>, ptr %m, align 64
- %10 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %9) #7
- %11 = load <256 x i32>, ptr %m, align 64
- %12 = call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #7
-
- %13 = call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col0, i16 %col1, x86_amx %8, x86_amx %10, x86_amx %12) #7
- %14 = call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13) #7
- store <256 x i32> %14, ptr %m, align 64
-
- ret void
- }
-
- ; Function Attrs: argmemonly nounwind readonly
- declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #2
-
- ; Function Attrs: nounwind readnone
- declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #3
-
- ; Function Attrs: nounwind
- declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #4
-
- ; Function Attrs: nounwind
- declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #4
-
- ; Function Attrs: nounwind readnone
- declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #3
-
- ; Function Attrs: argmemonly nounwind writeonly
- declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #5
-
- attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
- attributes #1 = { argmemonly nofree nounwind willreturn writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #2 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #3 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #4 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #5 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #6 = { argmemonly nofree nounwind willreturn "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #7 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll b/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
deleted file mode 100644
index 96966264..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_lower_type_O2.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -x86-lower-amx-type %s -S | FileCheck %s
-; RUN: opt --codegen-opt-level=2 -mtriple=x86_64 -passes=x86-lower-amx-type %s -S | FileCheck %s
-
- @buf = dso_local global [2048 x i8] zeroinitializer, align 16
- @buf2 = dso_local global [2048 x i8] zeroinitializer, align 16
-
- ; Function Attrs: nounwind uwtable
- define dso_local void @test_tile_2rpntlvwz0(i16 noundef signext %row, i16 noundef signext %col0, i16 noundef signext %col1) local_unnamed_addr #0 {
-; CHECK-LABEL: @test_tile_2rpntlvwz0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 [[ROW:%.*]], i16 [[COL0:%.*]], i16 [[COL1:%.*]], ptr @buf, i64 32) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { x86_amx, x86_amx } [[TMP0]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = tail call x86_amx @llvm.x86.tilezero.internal(i16 [[ROW]], i16 [[COL0]]) #[[ATTR3]]
-; CHECK-NEXT: [[TMP4:%.*]] = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 [[ROW]], i16 [[COL1]], i16 [[COL0]], x86_amx [[TMP3]], x86_amx [[TMP1]], x86_amx [[TMP2]]) #[[ATTR3]]
-; CHECK-NEXT: tail call void @llvm.x86.tilestored64.internal(i16 [[ROW]], i16 [[COL0]], ptr @buf2, i64 32, x86_amx [[TMP4]]) #[[ATTR3]]
-; CHECK-NEXT: ret void
-;
- entry:
- %0 = tail call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 %row, i16 %col0, i16 %col1, ptr @buf, i64 32) #5
- %1 = extractvalue { x86_amx, x86_amx } %0, 0
- %2 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %1) #5
- %3 = extractvalue { x86_amx, x86_amx } %0, 1
- %4 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %3) #5
- %5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col0) #5
- %6 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %5) #5
- %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %6) #5
- %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %2) #5
- %9 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4) #5
- %10 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col1, i16 %col0, x86_amx %7, x86_amx %8, x86_amx %9) #5
- %11 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %10) #5
- %12 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %11) #5
- tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col0, ptr @buf2, i64 32, x86_amx %12) #5
- ret void
- }
-
- ; Function Attrs: argmemonly nounwind readonly
- declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, ptr, i64) #1
-
- ; Function Attrs: nounwind readnone
- declare <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx) #2
-
- ; Function Attrs: nounwind
- declare x86_amx @llvm.x86.tilezero.internal(i16, i16) #3
-
- ; Function Attrs: nounwind
- declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) #3
-
- ; Function Attrs: nounwind readnone
- declare x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32>) #2
-
- ; Function Attrs: argmemonly nounwind writeonly
- declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx) #4
-
- attributes #0 = { nounwind uwtable "frame-pointer"="all" "min-legal-vector-width"="8192" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+amx-bf16,+amx-int8,+amx-tile,+amx-transpose,+avx,+avx2,+avx512f,+crc32,+cx8,+f16c,+fma,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+amx-tile,+amx-bf16,+avx512f,+amx-transpose" "tune-cpu"="generic" }
- attributes #1 = { argmemonly nounwind readonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #2 = { nounwind readnone "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #3 = { nounwind "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #4 = { argmemonly nounwind writeonly "target-features"="+amx-tile,+amx-bf16,+avx512f,+amx-transpose" }
- attributes #5 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
deleted file mode 100644
index 1e3b242..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O0.mir
+++ /dev/null
@@ -1,134 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=fastpretileconfig -o - %s | FileCheck %s
-
----
-name: test_tile_2rpntlvwz0
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes: false
-hasEHFunclets: false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
- - { id: 0, class: gr64_nosp, preferred-register: '' }
- - { id: 1, class: gr16, preferred-register: '' }
- - { id: 2, class: gr16, preferred-register: '' }
- - { id: 3, class: gr16, preferred-register: '' }
- - { id: 4, class: gr64, preferred-register: '' }
- - { id: 5, class: gr64, preferred-register: '' }
- - { id: 6, class: gr64, preferred-register: '' }
- - { id: 7, class: gr64_nosp, preferred-register: '' }
- - { id: 8, class: tilepair, preferred-register: '' }
- - { id: 9, class: tile, preferred-register: '' }
- - { id: 10, class: tile, preferred-register: '' }
- - { id: 11, class: tile, preferred-register: '' }
- - { id: 181, class: tile, preferred-register: '' }
- - { id: 183, class: tile, preferred-register: '' }
- - { id: 185, class: tile, preferred-register: '' }
- - { id: 186, class: tile, preferred-register: '' }
-liveins:
- - { reg: '$edi', virtual-reg: '%0' }
- - { reg: '$esi', virtual-reg: '%1' }
- - { reg: '$edx', virtual-reg: '%2' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 1024
- adjustsStack: false
- hasCalls: true
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: []
- restorePoint: []
-fixedStack: []
-stack:
- - { id: 18, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 19, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 20, name: '', type: default, offset: 0, size: 8, alignment: 8,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
- - { id: 21, name: '', type: default, offset: 0, size: 8,
- alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo:
- amxProgModel: ManagedRA
-body: |
- bb.0.entry:
- liveins: $rdi, $rsi, $rdx, $rax
-
- ; CHECK-LABEL: name: test_tile_2rpntlvwz0
- ; CHECK: liveins: $rdi, $rsi, $rdx, $rax
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
- ; CHECK-NEXT: VMOVUPSZmr %stack.4, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.4, align 4)
- ; CHECK-NEXT: MOV8mi %stack.4, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.4, align 4)
- ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 64
- ; CHECK-NEXT: [[MOV16ri:%[0-9]+]]:gr16 = MOV16ri 64
- ; CHECK-NEXT: [[MOV16ri1:%[0-9]+]]:gr16 = MOV16ri 16
- ; CHECK-NEXT: [[MOV16ri2:%[0-9]+]]:gr16 = MOV16ri 16
- ; CHECK-NEXT: PLDTILECFGV %stack.4, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.4, align 4)
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY $rdx
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64_nosp = COPY $rax
- ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[COPY2]], 1, killed [[COPY3]], 0, $noreg
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0
- ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY5]]
- ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg, killed [[COPY4]]
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV16ri]], [[MOV16ri1]]
- ; CHECK-NEXT: PTILESTOREDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTILEZEROV]]
- ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY]], 1, [[MOV32ri64_]], 0, $noreg
- ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri2]], [[COPY1]], 1, [[MOV32ri64_]], 0, $noreg
- ; CHECK-NEXT: [[PTILELOADDV2:%[0-9]+]]:tile = PTILELOADDV [[MOV16ri]], [[MOV16ri1]], [[COPY2]], 1, [[MOV32ri64_]], 0, $noreg
- ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[MOV16ri]], [[MOV16ri1]], [[MOV16ri2]], [[PTILELOADDV]], killed [[PTILELOADDV1]], killed [[PTILELOADDV2]]
- ; CHECK-NEXT: PTILESTOREDV killed [[MOV16ri]], killed [[MOV16ri1]], killed [[COPY2]], 1, killed [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]]
- %0:gr64_nosp = MOV32ri64 64
- %1:gr16 = MOV16ri 64
- %2:gr16 = MOV16ri 16
- %3:gr16 = MOV16ri 16
- %4:gr64 = COPY $rsi
- %5:gr64 = COPY $rdi
- %6:gr64 = COPY $rdx
- %7:gr64_nosp = COPY $rax
- %8:tilepair = PT2RPNTLVWZ0V %1, %2, %3, %6, 1, killed %7, 0, $noreg
- %9:tile = COPY %8.sub_t1
- %10:tile = COPY %8.sub_t0
- PTILESTOREDV %1, %2, %4, 1, %0, 0, $noreg, killed %10
- PTILESTOREDV %1, %3, %5, 1, %0, 0, $noreg, killed %9
- %11:tile = PTILEZEROV %1, %2
- PTILESTOREDV %1, %2, %6, 1, %0, 0, $noreg, killed %11
- %181:tile = PTILELOADDV %1, %2, %4, 1, %0, 0, $noreg
- %183:tile = PTILELOADDV %1, %3, %5, 1, %0, 0, $noreg
- %185:tile = PTILELOADDV %1, %2, %6, 1, %0, 0, $noreg
- %186:tile = PTDPBSSDV %1, %2, %3, %181, killed %183, killed %185
- PTILESTOREDV killed %1, killed %2, killed %6, 1, killed %0, 0, $noreg, killed %186
-...
diff --git a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir b/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
deleted file mode 100644
index ac2cdb4..0000000
--- a/llvm/test/CodeGen/X86/amx_tile_pair_preconfigure_O2.mir
+++ /dev/null
@@ -1,113 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -O2 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-bf16,+avx512f, \
-# RUN: -mattr=+amx-transpose -run-pass=tilepreconfig -o - %s | FileCheck %s
-
----
-name: test_tile_2rpntlvwz0
-alignment: 16
-exposesReturnsTwice: false
-legalized: false
-regBankSelected: false
-selected: false
-failedISel: false
-tracksRegLiveness: true
-hasWinCFI: false
-callsEHReturn: false
-callsUnwindInit: false
-hasEHContTarget: false
-hasEHScopes: false
-hasEHFunclets: false
-failsVerification: false
-tracksDebugUserValues: false
-registers:
- - { id: 0, class: gr32, preferred-register: '' }
- - { id: 1, class: gr32, preferred-register: '' }
- - { id: 2, class: gr32, preferred-register: '' }
- - { id: 3, class: gr16, preferred-register: '' }
- - { id: 4, class: gr16, preferred-register: '' }
- - { id: 5, class: gr16, preferred-register: '' }
- - { id: 6, class: gr64, preferred-register: '' }
- - { id: 7, class: gr64_nosp, preferred-register: '' }
- - { id: 8, class: tilepair, preferred-register: '' }
- - { id: 9, class: tile, preferred-register: '' }
- - { id: 10, class: tile, preferred-register: '' }
- - { id: 11, class: tile, preferred-register: '' }
- - { id: 12, class: tile, preferred-register: '' }
- - { id: 13, class: gr64, preferred-register: '' }
-liveins:
- - { reg: '$edi', virtual-reg: '%0' }
- - { reg: '$esi', virtual-reg: '%1' }
- - { reg: '$edx', virtual-reg: '%2' }
-frameInfo:
- isFrameAddressTaken: false
- isReturnAddressTaken: false
- hasStackMap: false
- hasPatchPoint: false
- stackSize: 0
- offsetAdjustment: 0
- maxAlignment: 1
- adjustsStack: false
- hasCalls: false
- stackProtector: ''
- functionContext: ''
- maxCallFrameSize: 4294967295
- cvBytesOfCalleeSavedRegisters: 0
- hasOpaqueSPAdjustment: false
- hasVAStart: false
- hasMustTailInVarArgFunc: false
- hasTailCall: false
- localFrameSize: 0
- savePoint: []
- restorePoint: []
-fixedStack: []
-stack: []
-callSites: []
-debugValueSubstitutions: []
-constants: []
-machineFunctionInfo:
- amxProgModel: ManagedRA
-body: |
- bb.0.entry:
- liveins: $edi, $esi, $edx, $rax, $rbx
-
- ; CHECK-LABEL: name: test_tile_2rpntlvwz0
- ; CHECK: liveins: $edi, $esi, $edx, $rax, $rbx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
- ; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
- ; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY $edi
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr16 = COPY [[COPY1]].sub_16bit
- ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr16 = COPY [[COPY2]].sub_16bit
- ; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def $tmm0, implicit-def $tmm1, implicit-def $tmm2, implicit-def $tmm3, implicit-def $tmm4, implicit-def $tmm5, implicit-def $tmm6, implicit-def $tmm7 :: (load (s512) from %stack.0, align 4)
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY $rax
- ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_nosp = MOV32ri64 32
- ; CHECK-NEXT: [[PT2RPNTLVWZ0V:%[0-9]+]]:tilepair = PT2RPNTLVWZ0V [[COPY5]], [[COPY4]], [[COPY3]], killed [[COPY6]], 1, [[MOV32ri64_]], 0, $noreg
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t1
- ; CHECK-NEXT: [[COPY8:%[0-9]+]]:tile = COPY [[PT2RPNTLVWZ0V]].sub_t0
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY5]], [[COPY4]]
- ; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY5]], [[COPY3]], [[COPY4]], [[PTILEZEROV]], killed [[COPY8]], killed [[COPY7]]
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY $rbx
- ; CHECK-NEXT: PTILESTOREDV [[COPY5]], [[COPY4]], killed [[COPY9]], 1, [[MOV32ri64_]], 0, $noreg, killed [[PTDPBSSDV]]
- ; CHECK-NEXT: RET 0
- %2:gr32 = COPY $edx
- %1:gr32 = COPY $esi
- %0:gr32 = COPY $edi
- %3:gr16 = COPY %2.sub_16bit
- %4:gr16 = COPY %1.sub_16bit
- %5:gr16 = COPY %0.sub_16bit
- %6:gr64 = COPY $rax
- %7:gr64_nosp = MOV32ri64 32
- %8:tilepair = PT2RPNTLVWZ0V %5, %4, %3, killed %6, 1, %7, 0, $noreg
- %9:tile = COPY %8.sub_t1
- %10:tile = COPY %8.sub_t0
- %11:tile = PTILEZEROV %5, %4
- %12:tile = PTDPBSSDV %5, %3, %4, %11, killed %10, killed %9
- %13:gr64 = COPY $rbx
- PTILESTOREDV %5, %4, killed %13, 1, %7, 0, $noreg, killed %12
- RET 0
-
-...
diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
deleted file mode 100644
index 4cfd97a..0000000
--- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll
+++ /dev/null
@@ -1,371 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+amx-bf16,+amx-fp16,+amx-complex,+amx-transpose,+egpr --show-mc-encoding | FileCheck %s --check-prefix=EGPR
-
-define void @test_amx(i32 %rv32, i64 %stride, i64 %rvalue, i8* %addr1, <4 x float> %xmm) #0 {
-; CHECK-LABEL: test_amx:
-; CHECK: # %bb.0:
-; CHECK-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0
-; CHECK-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2
-; CHECK-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0
-; CHECK-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2
-; CHECK-NEXT: ttransposed %tmm3, %tmm1
-; CHECK-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4
-; CHECK-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1
-; CHECK-NEXT: tconjtfp16 %tmm2, %tmm1
-; CHECK-NEXT: retq
-;
-; EGPR-LABEL: test_amx:
-; EGPR: # %bb.0:
-; EGPR-NEXT: t2rpntlvwz0 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x04,0x31]
-; EGPR-NEXT: t2rpntlvwz0t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x14,0x31]
-; EGPR-NEXT: t2rpntlvwz1 (%rcx,%rsi), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x04,0x31]
-; EGPR-NEXT: t2rpntlvwz1t1 (%rcx,%rsi), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x14,0x31]
-; EGPR-NEXT: ttransposed %tmm3, %tmm1 # encoding: [0xc4,0xe2,0x7a,0x5f,0xcb]
-; EGPR-NEXT: ttdpbf16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6c,0xca]
-; EGPR-NEXT: ttdpfp16ps %tmm6, %tmm5, %tmm4 # encoding: [0xc4,0xe2,0x4b,0x6c,0xe5]
-; EGPR-NEXT: ttcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x63,0x6b,0xca]
-; EGPR-NEXT: ttcmmrlfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x62,0x6b,0xca]
-; EGPR-NEXT: tconjtcmmimfp16ps %tmm3, %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x60,0x6b,0xca]
-; EGPR-NEXT: tconjtfp16 %tmm2, %tmm1 # encoding: [0xc4,0xe2,0x79,0x6b,0xca]
-; EGPR-NEXT: retq # encoding: [0xc3]
- call void @llvm.x86.t2rpntlvwz0(i8 1, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz0t1(i8 2, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz1(i8 1, i8* %addr1, i64 %stride)
- call void @llvm.x86.t2rpntlvwz1t1(i8 2, i8* %addr1, i64 %stride)
- call void @llvm.x86.ttransposed(i8 1, i8 3)
- call void @llvm.x86.ttdpbf16ps(i8 1, i8 2, i8 3)
- call void @llvm.x86.ttdpfp16ps(i8 4, i8 5, i8 6)
- call void @llvm.x86.ttcmmimfp16ps(i8 1, i8 2, i8 3)
- call void @llvm.x86.ttcmmrlfp16ps(i8 1, i8 2, i8 3)
- call void @llvm.x86.tconjtcmmimfp16ps(i8 1, i8 2, i8 3)
- call void @llvm.x86.tconjtfp16(i8 1, i8 2)
- ret void
-}
-
-declare void @llvm.x86.t2rpntlvwz0(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.t2rpntlvwz0t1(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.t2rpntlvwz1(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.t2rpntlvwz1t1(i8 %tile1, i8* %addr1, i64 %stride)
-declare void @llvm.x86.ttransposed(i8 %tile0, i8 %tile1)
-declare void @llvm.x86.ttdpbf16ps(i8 %tile0, i8 %tile1, i8 %tile2)
-declare void @llvm.x86.ttdpfp16ps(i8 %tile0, i8 %tile1, i8 %tile2)
-declare void @llvm.x86.ttcmmimfp16ps(i8 %A, i8 %B, i8 %C)
-declare void @llvm.x86.ttcmmrlfp16ps(i8 %A, i8 %B, i8 %C)
-declare void @llvm.x86.tconjtcmmimfp16ps(i8 %A, i8 %B, i8 %C)
-declare void @llvm.x86.tconjtfp16(i8 %A, i8 %B)
-
-define void @test_amx2(i8* %pointer, i8* %base, i64 %stride) #0 {
-; CHECK-LABEL: test_amx2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rbp
-; CHECK-NEXT: subq $2928, %rsp # imm = 0xB70
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, %ax
-; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0
-; CHECK-NEXT: tilezero %tmm1
-; CHECK-NEXT: tilezero %tmm2
-; CHECK-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2
-; CHECK-NEXT: movabsq $64, %rbp
-; CHECK-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill
-; CHECK-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
-; CHECK-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3
-; CHECK-NEXT: tconjtfp16 %tmm3, %tmm0
-; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx)
-; CHECK-NEXT: addq $2928, %rsp # imm = 0xB70
-; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-;
-; EGPR-LABEL: test_amx2:
-; EGPR: # %bb.0:
-; EGPR-NEXT: pushq %rbp # encoding: [0x55]
-; EGPR-NEXT: subq $2928, %rsp # encoding: [0x48,0x81,0xec,0x70,0x0b,0x00,0x00]
-; EGPR-NEXT: # imm = 0xB70
-; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
-; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0d]
-; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x40,0x03,0x00,0x00,0x01]
-; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x70,0x03,0x00,0x00,0x08]
-; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x50,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x71,0x03,0x00,0x00,0x08]
-; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x52,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x72,0x03,0x00,0x00,0x08]
-; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x54,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0x73,0x03,0x00,0x00,0x08]
-; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0x56,0x03,0x00,0x00,0x08,0x00]
-; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0x40,0x03,0x00,0x00]
-; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
-; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16]
-; EGPR-NEXT: tilezero %tmm1 # encoding: [0xc4,0xe2,0x7b,0x49,0xc8]
-; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0]
-; EGPR-NEXT: ttdpbf16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6c,0xd0]
-; EGPR-NEXT: ttdpfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6c,0xd0]
-; EGPR-NEXT: ttcmmimfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x6b,0xd0]
-; EGPR-NEXT: ttcmmrlfp16ps %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x72,0x6b,0xd0]
-; EGPR-NEXT: movabsq $64, %rbp # encoding: [0x48,0xbd,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm2, 896(%rsp,%rbp) # 1024-byte Folded Spill
-; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x94,0x2c,0x80,0x03,0x00,0x00]
-; EGPR-NEXT: tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
-; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x2c,0x80,0x03,0x00,0x00]
-; EGPR-NEXT: tconjtcmmimfp16ps %tmm1, %tmm0, %tmm3 # encoding: [0xc4,0xe2,0x70,0x6b,0xd8]
-; EGPR-NEXT: tconjtfp16 %tmm3, %tmm0 # encoding: [0xc4,0xe2,0x79,0x6b,0xc3]
-; EGPR-NEXT: tilestored %tmm2, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x14,0x17]
-; EGPR-NEXT: addq $2928, %rsp # encoding: [0x48,0x81,0xc4,0x70,0x0b,0x00,0x00]
-; EGPR-NEXT: # imm = 0xB70
-; EGPR-NEXT: popq %rbp # encoding: [0x5d]
-; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
-; EGPR-NEXT: retq # encoding: [0xc3]
-
- %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
- %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
- %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
- %c1 = call x86_amx @llvm.x86.ttdpbf16ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
- %c2 = call x86_amx @llvm.x86.ttdpfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b)
- %c3 = call x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c2, x86_amx %a, x86_amx %b)
- %c4 = call x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c3, x86_amx %a, x86_amx %b)
- %c5 = call x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16 8, i16 8, i16 8, x86_amx %c4, x86_amx %a, x86_amx %b)
- %c6 = call x86_amx @llvm.x86.tconjtfp16.internal(i16 8, i16 8, x86_amx %c5)
-
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c4)
- ret void
-}
-
-define void @test_amx3(i8* %pointer, i8* %base, i64 %stride) #0 {
-; CHECK-LABEL: test_amx3:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movw $8, %cx
-; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4
-; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4
-; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4
-; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4
-; CHECK-NEXT: ttransposed %tmm4, %tmm0
-; CHECK-NEXT: tilestored %tmm0, (%rdi,%rdx)
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-;
-; EGPR-LABEL: test_amx3:
-; EGPR: # %bb.0:
-; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
-; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xff]
-; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xc0,0x01]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf0,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd0,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf4,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xd8,0x08,0x00]
-; EGPR-NEXT: movb $0, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xf5,0x00]
-; EGPR-NEXT: movw $0, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0xda,0x00,0x00]
-; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0xc0]
-; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00]
-; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16]
-; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x24,0x16]
-; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x24,0x16]
-; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x24,0x16]
-; EGPR-NEXT: ttransposed %tmm4, %tmm0 # encoding: [0xc4,0xe2,0x7a,0x5f,0xc4]
-; EGPR-NEXT: tilestored %tmm0, (%rdi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x04,0x17]
-; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
-; EGPR-NEXT: retq # encoding: [0xc3]
- %1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
- %2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
- %3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
- %4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 0, i8* %base, i64 %stride)
- %5 = extractvalue { x86_amx, x86_amx } %4, 0
- %6 = call x86_amx @llvm.x86.ttransposed.internal(i16 8, i16 8, x86_amx %5)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %6)
- ret void
-}
-
-define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 {
-; CHECK-LABEL: test_amx_spill:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $6088, %rsp # imm = 0x17C8
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, %ax
-; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0
-; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4
-; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6
-; CHECK-NEXT: movabsq $64, %rcx
-; CHECK-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6
-; CHECK-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6
-; CHECK-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill
-; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6
-; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; CHECK-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; CHECK-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm6, (%rsi,%rdx)
-; CHECK-NEXT: tilestored %tmm7, (%rsi,%rdx)
-; CHECK-NEXT: addq $6088, %rsp # imm = 0x17C8
-; CHECK-NEXT: tilerelease
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-;
-; EGPR-LABEL: test_amx_spill:
-; EGPR: # %bb.0:
-; EGPR-NEXT: subq $6088, %rsp # encoding: [0x48,0x81,0xec,0xc8,0x17,0x00,0x00]
-; EGPR-NEXT: # imm = 0x17C8
-; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
-; EGPR-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0xfe]
-; EGPR-NEXT: movb $1, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x80,0x01]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb0,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x90,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb4,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x98,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb5,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9a,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb6,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9c,0x08,0x00]
-; EGPR-NEXT: movb $8, -{{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0xb7,0x08]
-; EGPR-NEXT: movw $8, -{{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x9e,0x08,0x00]
-; EGPR-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x80]
-; EGPR-NEXT: movw $8, %ax # encoding: [0x66,0xb8,0x08,0x00]
-; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x04,0x16]
-; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x24,0x16]
-; EGPR-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6f,0x34,0x16]
-; EGPR-NEXT: movabsq $64, %rcx # encoding: [0x48,0xb9,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x0f,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x13,0x00,0x00]
-; EGPR-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6e,0x34,0x16]
-; EGPR-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xb4,0x0c,0xc0,0x07,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x0b,0x00,0x00]
-; EGPR-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x6f,0x34,0x16]
-; EGPR-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0x74,0x0c,0xc0]
-; EGPR-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7a,0x4b,0xbc,0x0c,0xc0,0x03,0x00,0x00]
-; EGPR-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x6e,0x34,0x16]
-; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x0f,0x00,0x00]
-; EGPR-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x13,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT: tileloadd 1984(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xa4,0x0c,0xc0,0x07,0x00,0x00]
-; EGPR-NEXT: tileloadd 3008(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x0b,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT: tileloadd -64(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0x64,0x0c,0xc0]
-; EGPR-NEXT: tileloadd 960(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload
-; EGPR-NEXT: # encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x0c,0xc0,0x03,0x00,0x00]
-; EGPR-NEXT: tilestored %tmm4, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x24,0x16]
-; EGPR-NEXT: tilestored %tmm5, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x16]
-; EGPR-NEXT: tilestored %tmm6, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x34,0x16]
-; EGPR-NEXT: tilestored %tmm7, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x3c,0x16]
-; EGPR-NEXT: addq $6088, %rsp # encoding: [0x48,0x81,0xc4,0xc8,0x17,0x00,0x00]
-; EGPR-NEXT: # imm = 0x17C8
-; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0]
-; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
-; EGPR-NEXT: retq # encoding: [0xc3]
- %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
- %b1 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- %b2 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- %b3 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- %b4 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- %b5 = call { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16 8, i16 8, i16 8, i8* %base, i64 %stride)
- %e11 = extractvalue { x86_amx, x86_amx } %b1, 0
- %e12 = extractvalue { x86_amx, x86_amx } %b1, 1
- %e21 = extractvalue { x86_amx, x86_amx } %b2, 0
- %e22 = extractvalue { x86_amx, x86_amx } %b2, 1
- %e31 = extractvalue { x86_amx, x86_amx } %b3, 0
- %e32 = extractvalue { x86_amx, x86_amx } %b3, 1
- %e41 = extractvalue { x86_amx, x86_amx } %b4, 0
- %e42 = extractvalue { x86_amx, x86_amx } %b4, 1
- %e51 = extractvalue { x86_amx, x86_amx } %b5, 0
- %e52 = extractvalue { x86_amx, x86_amx } %b5, 1
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e11)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e12)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e21)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e22)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e31)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e32)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e41)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e42)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e51)
- call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %base, i64 %stride, x86_amx %e52)
- ret void
-}
-
-declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
-declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz0t1.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1.internal(i16, i16, i16, i8*, i64)
-declare { x86_amx, x86_amx } @llvm.x86.t2rpntlvwz1t1.internal(i16, i16, i16, i8*, i64)
-declare x86_amx @llvm.x86.ttransposed.internal(i16, i16, x86_amx)
-declare x86_amx @llvm.x86.ttdpbf16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttdpfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.ttcmmrlfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.tconjtcmmimfp16ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
-declare x86_amx @llvm.x86.tconjtfp16.internal(i16, i16, x86_amx)
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/apx/compress-evex.mir b/llvm/test/CodeGen/X86/apx/compress-evex.mir
index c0ecfac0..e0873d3 100644
--- a/llvm/test/CodeGen/X86/apx/compress-evex.mir
+++ b/llvm/test/CodeGen/X86/apx/compress-evex.mir
@@ -139,3 +139,11 @@ body: |
$ax = XOR16rr_ND $ax, killed $di, implicit-def dead $eflags
RET64 $rax
...
+---
+name: setzuccm_2_setccm
+body: |
+ bb.0.entry:
+ liveins: $eflags
+ ; CHECK: sete 7(%rsp) # EVEX TO LEGACY Compression encoding: [0x0f,0x94,0x44,0x24,0x07]
+ SETZUCCm $rsp, 1, $noreg, 7, $noreg, 4, implicit killed $eflags
+...
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
index 805fc7c..6f31aef 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-general.ll
@@ -1,76 +1,80 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+sse2,+ssse3,+egpr,+avx | FileCheck %s --check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+ssse3,+egpr,+avx --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,AVX
define i32 @map0(ptr nocapture noundef readonly %a, i64 noundef %b) {
- ; SSE-LABEL: name: map0
- ; SSE: bb.0.entry:
- ; SSE-NEXT: liveins: $rdi, $rsi
- ; SSE-NEXT: {{ $}}
- ; SSE-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi
- ; SSE-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
- ; SSE-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr)
- ; SSE-NEXT: $eax = COPY [[MOV32rm]]
- ; SSE-NEXT: RET 0, $eax
- ; AVX-LABEL: name: map0
- ; AVX: bb.0.entry:
- ; AVX-NEXT: liveins: $rdi, $rsi
- ; AVX-NEXT: {{ $}}
- ; AVX-NEXT: [[COPY:%[0-9]+]]:gr64_nosp = COPY $rsi
- ; AVX-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi
- ; AVX-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s32) from %ir.add.ptr)
- ; AVX-NEXT: $eax = COPY [[MOV32rm]]
- ; AVX-NEXT: RET 0, $eax
+; CHECK-LABEL: map0:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movq %rsi, %r16 # encoding: [0xd5,0x18,0x89,0xf0]
+; CHECK-NEXT: movq %rdi, %r17 # encoding: [0xd5,0x18,0x89,0xf9]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl (%r17,%r16,4), %eax # encoding: [0xd5,0x30,0x8b,0x04,0x81]
+; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%add.ptr = getelementptr inbounds i32, ptr %a, i64 %b
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
%0 = load i32, ptr %add.ptr
ret i32 %0
}
-define i32 @map1_or_vex(<2 x double> noundef %a) {
- ; SSE-LABEL: name: map1_or_vex
- ; SSE: bb.0.entry:
- ; SSE-NEXT: liveins: $xmm0
- ; SSE-NEXT: {{ $}}
- ; SSE-NEXT: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0
- ; SSE-NEXT: [[CVTSD2SIrr_Int:%[0-9]+]]:gr32 = nofpexcept CVTSD2SIrr_Int [[COPY]], implicit $mxcsr
- ; SSE-NEXT: $eax = COPY [[CVTSD2SIrr_Int]]
- ; SSE-NEXT: RET 0, $eax
- ; AVX-LABEL: name: map1_or_vex
- ; AVX: bb.0.entry:
- ; AVX-NEXT: liveins: $xmm0
- ; AVX-NEXT: {{ $}}
- ; AVX-NEXT: [[COPY:%[0-9]+]]:vr128 = COPY $xmm0
- ; AVX-NEXT: [[VCVTSD2SIrr_Int:%[0-9]+]]:gr32_norex2 = nofpexcept VCVTSD2SIrr_Int [[COPY]], implicit $mxcsr
- ; AVX-NEXT: $eax = COPY [[VCVTSD2SIrr_Int]]
- ; AVX-NEXT: RET 0, $eax
+define i32 @map1_or_vex(<2 x double> noundef %a) nounwind {
+; SSE-LABEL: map1_or_vex:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: cvtsd2si %xmm0, %r16d # encoding: [0xf2,0xd5,0xc0,0x2d,0xc0]
+; SSE-NEXT: #APP
+; SSE-NEXT: nop # encoding: [0x90]
+; SSE-NEXT: #NO_APP
+; SSE-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0]
+; SSE-NEXT: retq # encoding: [0xc3]
+;
+; AVX-LABEL: map1_or_vex:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %rbx # encoding: [0x53]
+; AVX-NEXT: vcvtsd2si %xmm0, %ebx # encoding: [0xc5,0xfb,0x2d,0xd8]
+; AVX-NEXT: #APP
+; AVX-NEXT: nop # encoding: [0x90]
+; AVX-NEXT: #NO_APP
+; AVX-NEXT: movl %ebx, %eax # encoding: [0x89,0xd8]
+; AVX-NEXT: popq %rbx # encoding: [0x5b]
+; AVX-NEXT: retq # encoding: [0xc3]
entry:
%0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a)
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
ret i32 %0
}
-define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) {
- ; SSE-LABEL: name: map2_or_vex
- ; SSE: bb.0.entry:
- ; SSE-NEXT: liveins: $rdi, $rsi
- ; SSE-NEXT: {{ $}}
- ; SSE-NEXT: [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi
- ; SSE-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; SSE-NEXT: [[PABSBrm:%[0-9]+]]:vr128 = PABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr)
- ; SSE-NEXT: $xmm0 = COPY [[PABSBrm]]
- ; SSE-NEXT: RET 0, $xmm0
- ; AVX-LABEL: name: map2_or_vex
- ; AVX: bb.0.entry:
- ; AVX-NEXT: liveins: $rdi, $rsi
- ; AVX-NEXT: {{ $}}
- ; AVX-NEXT: [[COPY:%[0-9]+]]:gr64_norex2_nosp = COPY $rsi
- ; AVX-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; AVX-NEXT: [[VPABSBrm:%[0-9]+]]:vr128 = VPABSBrm [[COPY1]], 4, [[COPY]], 0, $noreg :: (load (s128) from %ir.add.ptr)
- ; AVX-NEXT: $xmm0 = COPY [[VPABSBrm]]
- ; AVX-NEXT: RET 0, $xmm0
+define <2 x i64> @map2_or_vex(ptr nocapture noundef readonly %b, i64 noundef %c) nounwind {
+; SSE-LABEL: map2_or_vex:
+; SSE: # %bb.0: # %entry
+; SSE-NEXT: pushq %r14 # encoding: [0x41,0x56]
+; SSE-NEXT: pushq %rbx # encoding: [0x53]
+; SSE-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3]
+; SSE-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe]
+; SSE-NEXT: #APP
+; SSE-NEXT: nop # encoding: [0x90]
+; SSE-NEXT: #NO_APP
+; SSE-NEXT: pabsb (%r14,%rbx,4), %xmm0 # encoding: [0x66,0x41,0x0f,0x38,0x1c,0x04,0x9e]
+; SSE-NEXT: popq %rbx # encoding: [0x5b]
+; SSE-NEXT: popq %r14 # encoding: [0x41,0x5e]
+; SSE-NEXT: retq # encoding: [0xc3]
+;
+; AVX-LABEL: map2_or_vex:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: pushq %r14 # encoding: [0x41,0x56]
+; AVX-NEXT: pushq %rbx # encoding: [0x53]
+; AVX-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3]
+; AVX-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe]
+; AVX-NEXT: #APP
+; AVX-NEXT: nop # encoding: [0x90]
+; AVX-NEXT: #NO_APP
+; AVX-NEXT: vpabsb (%r14,%rbx,4), %xmm0 # encoding: [0xc4,0xc2,0x79,0x1c,0x04,0x9e]
+; AVX-NEXT: popq %rbx # encoding: [0x5b]
+; AVX-NEXT: popq %r14 # encoding: [0x41,0x5e]
+; AVX-NEXT: retq # encoding: [0xc3]
entry:
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
%add.ptr = getelementptr inbounds i32, ptr %b, i64 %c
%a = load <2 x i64>, ptr %add.ptr
%0 = bitcast <2 x i64> %a to <16 x i8>
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
index 5fa4cb4..a6ab98f 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-amx.ll
@@ -1,17 +1,20 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+amx-tile,+egpr | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+amx-tile,+egpr --show-mc-encoding | FileCheck %s
-define dso_local void @amx(ptr noundef %data) {
- ; CHECK-LABEL: name: amx
- ; CHECK: bb.0.entry:
- ; CHECK-NEXT: liveins: $rdi
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64_norex2_nosp = MOV32ri64 8
- ; CHECK-NEXT: PTILELOADD 4, [[COPY]], 1, killed [[MOV32ri64_]], 0, $noreg
- ; CHECK-NEXT: RET 0
- entry:
+define dso_local void @amx(ptr noundef %data) nounwind {
+; CHECK-LABEL: amx:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl $8, %eax # encoding: [0xb8,0x08,0x00,0x00,0x00]
+; CHECK-NEXT: tileloadd (%rbx,%rax), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x24,0x03]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: retq # encoding: [0xc3]
+entry:
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
call void @llvm.x86.tileloadd64(i8 4, ptr %data, i64 8)
ret void
}
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
index a9ca591..e7bc0c3 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-pseudo-x87.ll
@@ -1,17 +1,22 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=-sse,+egpr | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=-sse,+egpr --show-mc-encoding | FileCheck %s
-define void @x87(ptr %0, ptr %1) {
- ; CHECK-LABEL: name: x87
- ; CHECK: bb.0 (%ir-block.2):
- ; CHECK-NEXT: liveins: $rdi, $rsi
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64_norex2 = COPY $rsi
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: [[LD_Fp32m:%[0-9]+]]:rfp32 = nofpexcept LD_Fp32m [[COPY1]], 1, $noreg, 0, $noreg, implicit-def dead $fpsw, implicit $fpcw :: (load (s32) from %ir.0)
- ; CHECK-NEXT: nofpexcept ST_Fp32m [[COPY]], 1, $noreg, 0, $noreg, killed [[LD_Fp32m]], implicit-def dead $fpsw, implicit $fpcw :: (store (s32) into %ir.1)
- ; CHECK-NEXT: RET 0
+define void @x87(ptr %0, ptr %1) nounwind {
+; CHECK-LABEL: x87:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %r14 # encoding: [0x41,0x56]
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movq %rsi, %rbx # encoding: [0x48,0x89,0xf3]
+; CHECK-NEXT: movq %rdi, %r14 # encoding: [0x49,0x89,0xfe]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: flds (%r14) # encoding: [0x41,0xd9,0x06]
+; CHECK-NEXT: fstps (%rbx) # encoding: [0xd9,0x1b]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: popq %r14 # encoding: [0x41,0x5e]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{edx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
%3 = load float, ptr %0
store float %3, ptr %1
ret void
diff --git a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
index 8653442..9b89bce 100644
--- a/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
+++ b/llvm/test/CodeGen/X86/apx/no-rex2-special.ll
@@ -1,70 +1,81 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr | FileCheck %s
-; RUN: llc < %s -enable-new-pm -mtriple=x86_64-unknown -stop-after=x86-isel -mattr=+xsave,+egpr | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+xsave,+egpr --show-mc-encoding | FileCheck %s
-define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xsave
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XSAVE [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xsave(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xsave:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0]
+; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2]
+; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0]
+; CHECK-NEXT: xsave (%rbx) # encoding: [0x0f,0xae,0x23]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
call void @llvm.x86.xsave(ptr %ptr, i32 %hi, i32 %lo)
ret void;
}
declare void @llvm.x86.xsave(ptr, i32, i32)
-define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xsave64
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XSAVE64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xsave64(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xsave64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0]
+; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2]
+; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0]
+; CHECK-NEXT: xsave64 (%rbx) # encoding: [0x48,0x0f,0xae,0x23]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
call void @llvm.x86.xsave64(ptr %ptr, i32 %hi, i32 %lo)
ret void;
}
declare void @llvm.x86.xsave64(ptr, i32, i32)
-define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xrstor
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XRSTOR [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xrstor(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xrstor:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0]
+; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2]
+; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0]
+; CHECK-NEXT: xrstor (%rbx) # encoding: [0x0f,0xae,0x2b]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
call void @llvm.x86.xrstor(ptr %ptr, i32 %hi, i32 %lo)
ret void;
}
declare void @llvm.x86.xrstor(ptr, i32, i32)
-define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) {
- ; CHECK-LABEL: name: test_xrstor64
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $rdi, $esi, $edx
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $esi
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_norex2 = COPY $rdi
- ; CHECK-NEXT: $edx = COPY [[COPY1]]
- ; CHECK-NEXT: $eax = COPY [[COPY]]
- ; CHECK-NEXT: XRSTOR64 [[COPY2]], 1, $noreg, 0, $noreg, implicit $edx, implicit $eax
- ; CHECK-NEXT: RET 0
+define void @test_xrstor64(ptr %ptr, i32 %hi, i32 %lo) nounwind {
+; CHECK-LABEL: test_xrstor64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx # encoding: [0x53]
+; CHECK-NEXT: movl %edx, %r16d # encoding: [0xd5,0x10,0x89,0xd0]
+; CHECK-NEXT: movl %esi, %edx # encoding: [0x89,0xf2]
+; CHECK-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb]
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop # encoding: [0x90]
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %r16d, %eax # encoding: [0xd5,0x40,0x89,0xc0]
+; CHECK-NEXT: xrstor64 (%rbx) # encoding: [0x48,0x0f,0xae,0x2b]
+; CHECK-NEXT: popq %rbx # encoding: [0x5b]
+; CHECK-NEXT: retq # encoding: [0xc3]
+ tail call void asm sideeffect "nop", "~{eax},~{ecx},~{esi},~{edi},~{r8},~{r9},~{r10},~{r11}"()
call void @llvm.x86.xrstor64(ptr %ptr, i32 %hi, i32 %lo)
ret void;
}
diff --git a/llvm/test/CodeGen/X86/apx/setzucc.ll b/llvm/test/CodeGen/X86/apx/setzucc.ll
index 6eb2d69..d32ccf8 100644
--- a/llvm/test/CodeGen/X86/apx/setzucc.ll
+++ b/llvm/test/CodeGen/X86/apx/setzucc.ll
@@ -89,3 +89,15 @@ bb1:
bb2:
ret i32 0
}
+
+define i32 @highmask_i32_mask32(i32 %val) {
+; CHECK-LABEL: highmask_i32_mask32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: testl $-1048576, %edi # imm = 0xFFF00000
+; CHECK-NEXT: setzune %al
+; CHECK-NEXT: retq
+ %and = and i32 %val, -1048576
+ %cmp = icmp ne i32 %and, 0
+ %ret = zext i1 %cmp to i32
+ ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
index b4d40fe..71887e3 100644
--- a/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
+++ b/llvm/test/CodeGen/X86/atomic-rm-bit-test.ll
@@ -2156,15 +2156,17 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no
; X64-LABEL: atomic_shl1_mask01_xor_16_gpr_brz:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %ecx, %edx
; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: movl $1, %esi
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NEXT: shll %cl, %esi
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: .p2align 4
; X64-NEXT: .LBB34_1: # %atomicrmw.start
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movl %eax, %ecx
-; X64-NEXT: xorl %edx, %ecx
+; X64-NEXT: xorl %esi, %ecx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: lock cmpxchgw %cx, (%rdi)
; X64-NEXT: # kill: def $ax killed $ax def $eax
@@ -2172,12 +2174,12 @@ define zeroext i16 @atomic_shl1_mask01_xor_16_gpr_brz(ptr %v, i16 zeroext %c) no
; X64-NEXT: # %bb.2: # %atomicrmw.end
; X64-NEXT: movzwl %ax, %ecx
; X64-NEXT: movw $123, %ax
-; X64-NEXT: testl %ecx, %edx
+; X64-NEXT: testl %ecx, %esi
; X64-NEXT: je .LBB34_3
; X64-NEXT: # %bb.4: # %return
; X64-NEXT: retq
; X64-NEXT: .LBB34_3: # %if.then
-; X64-NEXT: movzwl %si, %eax
+; X64-NEXT: movzwl %dx, %eax
; X64-NEXT: movzwl (%rdi,%rax,2), %eax
; X64-NEXT: retq
entry:
@@ -3398,10 +3400,12 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n
; X64-LABEL: atomic_shl1_mask01_and_16_gpr_brnz:
; X64: # %bb.0: # %entry
; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %ecx, %edx
; X64-NEXT: andb $15, %cl
-; X64-NEXT: movl $1, %edx
-; X64-NEXT: shll %cl, %edx
+; X64-NEXT: movl $1, %esi
+; X64-NEXT: shll %cl, %esi
; X64-NEXT: movl $-2, %r8d
+; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: roll %cl, %r8d
; X64-NEXT: movzwl (%rdi), %eax
; X64-NEXT: .p2align 4
@@ -3415,10 +3419,10 @@ define zeroext i16 @atomic_shl1_mask01_and_16_gpr_brnz(ptr %v, i16 zeroext %c) n
; X64-NEXT: jne .LBB52_1
; X64-NEXT: # %bb.2: # %atomicrmw.end
; X64-NEXT: movzwl %ax, %eax
-; X64-NEXT: testl %eax, %edx
+; X64-NEXT: testl %eax, %esi
; X64-NEXT: je .LBB52_3
; X64-NEXT: # %bb.4: # %if.then
-; X64-NEXT: movzwl %si, %eax
+; X64-NEXT: movzwl %dx, %eax
; X64-NEXT: movzwl (%rdi,%rax,2), %eax
; X64-NEXT: retq
; X64-NEXT: .LBB52_3:
diff --git a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
index 105ee7f..e118f5d 100644
--- a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
+++ b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
@@ -46,8 +46,9 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x
; CHECK-NEXT: orl %edx, %eax
; CHECK-NEXT: lock cmpxchgl %ecx, (%rbx)
; CHECK-NEXT: setne %cl
-; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: shrl $16, %eax
+; CHECK-NEXT: pinsrw $0, %edx, %xmm0
; CHECK-NEXT: pinsrw $0, %eax, %xmm1
; CHECK-NEXT: testb %cl, %cl
; CHECK-NEXT: jne .LBB0_1
diff --git a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
index 76d84c1..860d60f 100644
--- a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
@@ -97,3 +97,99 @@ define <16 x i32>@test_int_x86_avx10_vpdpbuuds_512(<16 x i32> %x0, <16 x i32> %x
%res = call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
ret <16 x i32> %res
}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwsud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwsud_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwsud_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwsuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwsuds_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwsuds_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwusd_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwusd_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwusds_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwusds_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwuud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwuud_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwuud_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwuuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwuuds_512:
+; X86: # %bb.0:
+; X86-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwuuds_512:
+; X64: # %bb.0:
+; X64-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ ret <16 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
index 79849a7..d9b4635 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
; X86: # %bb.0:
-; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
; X86-NEXT: vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
index a2aad60..e9c6cb6 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
@@ -220,7 +220,7 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8
; VNNI INT16
-define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) {
; X86-LABEL: test_mm512_dpwsud_epi32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -231,12 +231,12 @@ define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
; X64: # %bb.0:
; X64-NEXT: vpdpwsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x07]
; X64-NEXT: retq # encoding: [0xc3]
- %__B = load <16 x i32>, ptr %pB
- %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %__B = load <32 x i16>, ptr %pB
+ %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
ret <16 x i32> %res
}
-define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) {
; X86-LABEL: test_mm512_mask_dpwsuds_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
@@ -248,13 +248,13 @@ define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %_
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
%bst = bitcast i16 %__U to <16 x i1>
%res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
ret <16 x i32> %res
}
-define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) {
; X86-LABEL: test_mm512_maskz_dpwsud_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
@@ -266,14 +266,14 @@ define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %_
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+ %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
%bst = bitcast i16 %__U to <16 x i1>
%res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
ret <16 x i32> %res
}
-declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
-declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <32 x i16>, <32 x i16>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <32 x i16>, <32 x i16>)
define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
; X86-LABEL: test_mm512_dpwusd_epi32:
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
index 0f2c75b..01b7618 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
; X86: # %bb.0:
-; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
index 1f270d5..bf7f937 100644
--- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
@@ -334,7 +334,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>)
; VNNI INT16
-define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) {
; X86-LABEL: test_mm_mask_dpwsud_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -346,13 +346,13 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
%bst = bitcast i4 %__U to <4 x i1>
%res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
ret <4 x i32> %res
}
-define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) {
; X86-LABEL: test_mm_maskz_dpwsuds_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -364,13 +364,13 @@ define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+ %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
%bst = bitcast i4 %__U to <4 x i1>
%res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
ret <4 x i32> %res
}
-define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) {
; X86-LABEL: test_mm256_maskz_dpwsuds_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -382,13 +382,13 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
%bst = bitcast i8 %__U to <8 x i1>
%res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
ret <8 x i32> %res
}
-define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) {
; X86-LABEL: test_mm256_mask_dpwsud_epi32:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -400,16 +400,16 @@ define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W,
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
; X64-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2]
; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
%bst = bitcast i8 %__U to <8 x i1>
%res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
ret <8 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <16 x i16>, <16 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <16 x i16>, <16 x i16>)
define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
; X86-LABEL: test_mm_mask_dpwusd_epi32:
diff --git a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
index 38d54cf..00db1fb 100644
--- a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
@@ -652,14 +652,14 @@ define <4 x i32> @test_int_x86_mask_vcvtt_pd2udqs_128(<2 x double> %x0, <4 x i32
; X64-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vcvttpd2dqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc8]
+; X64-NEXT: vcvttpd2udqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc8]
; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vcvttpd2dqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc8]
+; X86-NEXT: vcvttpd2udqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc8]
; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
; X86-NEXT: retl # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> %src, i8 %mask)
@@ -670,13 +670,13 @@ define <4 x i32> @test_int_x86_maskz_vcvtt_pd2udqs_128_z(<2 x double> %x0, i8 %m
; X64-LABEL: test_int_x86_maskz_vcvtt_pd2udqs_128_z:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6d,0xc0]
+; X64-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6c,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_maskz_vcvtt_pd2udqs_128_z:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6d,0xc0]
+; X86-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6c,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> zeroinitializer, i8 %mask)
ret <4 x i32> %res
@@ -686,13 +686,13 @@ define <4 x i32> @test_int_x86_mask_vcvtt_pd2udqs_128_undef(<2 x double> %x0, i8
; X64-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128_undef:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc0]
+; X64-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc0]
; X64-NEXT: retq # encoding: [0xc3]
;
; X86-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128_undef:
; X86: # %bb.0:
; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc0]
+; X86-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc0]
; X86-NEXT: retl # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> undef, i8 %mask)
ret <4 x i32> %res
diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll
index 1133cdfd..d21df472 100644
--- a/llvm/test/CodeGen/X86/avx2-arith.ll
+++ b/llvm/test/CodeGen/X86/avx2-arith.ll
@@ -121,14 +121,13 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
; CHECK-LABEL: mul_v32i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm3
-; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3
-; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT: vpand %ymm3, %ymm2, %ymm2
+; CHECK-NEXT: vpandn %ymm1, %ymm3, %ymm1
; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpsllw $8, %ymm0, %ymm0
-; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
%x = mul <32 x i8> %i, %j
ret <32 x i8> %x
diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
index 77053e2..4dd883a 100644
--- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
+++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin-deprecated.ll
@@ -255,8 +255,8 @@ define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf)
; CHECK-LABEL: gather_qps:
; CHECK: ## %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
-; CHECK-NEXT: kxnorw %k0, %k0, %k2
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k2
; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
@@ -520,7 +520,7 @@ define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, ptr %x1,
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -772,7 +772,7 @@ define void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i32> %x2, <
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -788,7 +788,7 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, <
; CHECK: ## %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -800,9 +800,9 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, <
define void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: kxorb %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: kmovd %eax, %k1
diff --git a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index df71e3c..5ed91ea 100644
--- a/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/llvm/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -251,9 +251,9 @@ define dso_local void @scatter_mask_qps_execdomain(<8 x i64> %ind, ptr %src, i8
define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_qps:
; CHECK: # %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: kxnorw %k0, %k0, %k2
+; CHECK-NEXT: kxnorb %k0, %k0, %k2
; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
@@ -523,7 +523,7 @@ define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, ptr
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0
@@ -774,7 +774,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -789,7 +789,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i
; CHECK: # %bb.0:
; CHECK-NEXT: kmovd %esi, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -802,9 +802,9 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i
define dso_local void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK: # %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
-; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: kxorb %k0, %k0, %k1
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT: movb $1, %al
; CHECK-NEXT: kmovd %eax, %k1
@@ -856,7 +856,7 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, ptr %b
define <8 x float> @gather_global(<8 x i64>, ptr nocapture readnone) {
; CHECK-LABEL: gather_global:
; CHECK: # %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vgatherqps x(,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT: vmovaps %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir b/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir
new file mode 100644
index 0000000..0d8f217
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-i386-setallones-pseudo.mir
@@ -0,0 +1,26 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+# RUN: llc %s -mtriple=i386-- -start-before=postrapseudos -o - | FileCheck %s
+
+--- |
+ target triple = "i386-unknown-unknown"
+
+ define void @setallones() #0 {
+ ; CHECK-LABEL: setallones:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
+ ; CHECK-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+ entry:
+ unreachable
+ }
+
+ attributes #0 = { "target-features"="+avx512f,+avx512vl" }
+---
+name: setallones
+tracksRegLiveness: true
+liveins: []
+body: |
+ bb.0:
+ $xmm0 = AVX512_128_SETALLONES
+ $ymm1 = AVX512_256_SETALLONES
+
+...
diff --git a/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
new file mode 100644
index 0000000..ca5f319
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
@@ -0,0 +1,229 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512DQBW
+
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
+declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
+declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
+
+; Test case 1: Direct v8i1 all-ones mask (should use kxnorb on AVX512DQ)
+define <8 x float> @mask_v8i1_allones(ptr %ptr) {
+; AVX512F-LABEL: mask_v8i1_allones:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movw $255, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: mask_v8i1_allones:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: mask_v8i1_allones:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movw $255, %ax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: mask_v8i1_allones:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512DQBW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQBW-NEXT: retq
+ %res = call <8 x float> @llvm.masked.expandload.v8f32(ptr %ptr, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> zeroinitializer)
+ ret <8 x float> %res
+}
+
+; Test case 2: v16i1 with lower 8 bits set via bitconvert (should use kxnorb on AVX512DQ)
+define <16 x float> @mask_v16i1_lower8(ptr %ptr) {
+; AVX512F-LABEL: mask_v16i1_lower8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movw $255, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: mask_v16i1_lower8:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: mask_v16i1_lower8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movw $255, %ax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: mask_v16i1_lower8:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512DQBW-NEXT: retq
+ %res = call <16 x float> @llvm.masked.expandload.v16f32(ptr %ptr, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> zeroinitializer)
+ ret <16 x float> %res
+}
+
+; Test case 3: v16i1 with all bits set (should use kxnorw on all targets)
+define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
+; AVX512-LABEL: gather_all:
+; AVX512: # %bb.0:
+; AVX512-NEXT: kxnorw %k0, %k0, %k1
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512-NEXT: vmovaps %zmm1, %zmm0
+; AVX512-NEXT: retq
+ %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> poison)
+ ret <16 x float> %res
+}
+
+; Test case 4: v16i1 with lower 8 bits set in gather (should use kxnorb on AVX512DQ targets)
+define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
+; AVX512F-LABEL: gather_lower:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: movw $255, %ax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: gather_lower:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: gather_lower:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: movw $255, %ax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: gather_lower:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
+; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
+; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0
+; AVX512DQBW-NEXT: retq
+ %broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
+ %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
+ %sext_ind = sext <16 x i32> %ind to <16 x i64>
+ %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
+ %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> poison)
+ ret <16 x float> %res
+}
+
+; Test case 5: v32i1 mask via bitconvert combined with dynamic condition.
+; Ensures the lower 16 all-ones lanes force the KSET1W path without being folded into a shuffle.
+define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
+; AVX512F-LABEL: mask_v32i1_lower16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: mask_v32i1_lower16:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: mask_v32i1_lower16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movl $65535, %eax # imm = 0xFFFF
+; AVX512BW-NEXT: kmovd %eax, %k0
+; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: kord %k0, %k1, %k1
+; AVX512BW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: mask_v32i1_lower16:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k0
+; AVX512DQBW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1
+; AVX512DQBW-NEXT: kord %k0, %k1, %k1
+; AVX512DQBW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW-NEXT: retq
+ %mask0 = bitcast i32 65535 to <32 x i1>
+ %mask1 = icmp sgt <32 x i16> %c, %d
+ %mask = or <32 x i1> %mask0, %mask1
+ %res = select <32 x i1> %mask, <32 x i16> %a, <32 x i16> %b
+ ret <32 x i16> %res
+}
+
+; Test case 6: v64i1 mask via bitconvert combined with dynamic condition.
+; Verifies that the KSET1D submask pattern survives SelectionDAG combines.
+define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
+; AVX512F-LABEL: mask_v64i1_lower32:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512F-NEXT: retq
+;
+; AVX512DQ-LABEL: mask_v64i1_lower32:
+; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
+; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
+; AVX512DQ-NEXT: retq
+;
+; AVX512BW-LABEL: mask_v64i1_lower32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; AVX512BW-NEXT: kmovq %rax, %k0
+; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: korq %k0, %k1, %k1
+; AVX512BW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: retq
+;
+; AVX512DQBW-LABEL: mask_v64i1_lower32:
+; AVX512DQBW: # %bb.0:
+; AVX512DQBW-NEXT: kxnord %k0, %k0, %k0
+; AVX512DQBW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1
+; AVX512DQBW-NEXT: korq %k0, %k1, %k1
+; AVX512DQBW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512DQBW-NEXT: retq
+ %mask0 = bitcast i64 4294967295 to <64 x i1>
+ %mask1 = icmp sgt <64 x i8> %c, %d
+ %mask = or <64 x i1> %mask0, %mask1
+ %res = select <64 x i1> %mask, <64 x i8> %a, <64 x i8> %b
+ ret <64 x i8> %res
+}
+
diff --git a/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir
new file mode 100644
index 0000000..7e5ddc4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512-setallones-pseudo.mir
@@ -0,0 +1,30 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+# RUN: llc %s -mtriple=x86_64-- -start-before=postrapseudos -o - | FileCheck %s
+
+--- |
+ target triple = "x86_64-unknown-unknown"
+
+ define void @setallones() #0 {
+ ; CHECK-LABEL: setallones:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: vpcmpeqd %xmm14, %xmm14, %xmm14
+ ; CHECK-NEXT: vpternlogd {{.*#+}} xmm16 = -1
+ ; CHECK-NEXT: vpcmpeqd %ymm15, %ymm15, %ymm15
+ ; CHECK-NEXT: vpternlogd {{.*#+}} ymm17 = -1
+ entry:
+ unreachable
+ }
+
+ attributes #0 = { "target-features"="+avx512f,+avx512vl" }
+---
+name: setallones
+tracksRegLiveness: true
+liveins: []
+body: |
+ bb.0:
+ $xmm14 = AVX512_128_SETALLONES
+ $xmm16 = AVX512_128_SETALLONES
+ $ymm15 = AVX512_256_SETALLONES
+ $ymm17 = AVX512_256_SETALLONES
+
+...
diff --git a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index a24c1d8..7fb2041 100644
--- a/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/llvm/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -52,13 +52,12 @@ define <8 x i1> @test3(<4 x i1> %a) {
define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test4:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
-; CHECK-NEXT: vpmovd2m %xmm1, %k0
-; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
-; CHECK-NEXT: vpmovd2m %xmm0, %k1
-; CHECK-NEXT: kshiftlb $4, %k0, %k0
-; CHECK-NEXT: korb %k0, %k1, %k0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpslld $31, %ymm0, %ymm0
+; CHECK-NEXT: vpmovd2m %ymm0, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -68,13 +67,12 @@ define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test5:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1
-; CHECK-NEXT: vpmovq2m %xmm1, %k0
-; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
-; CHECK-NEXT: vpmovq2m %xmm0, %k1
-; CHECK-NEXT: kshiftlb $2, %k0, %k0
-; CHECK-NEXT: korw %k0, %k1, %k0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsllq $63, %ymm0, %ymm0
+; CHECK-NEXT: vpmovq2m %ymm0, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
index b8ebe2a..ddf0050 100644
--- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
@@ -178,18 +178,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
ret { <4 x i32>, <4 x i32> } %res2
}
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x52,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
ret <8 x i32> %1
}
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -209,11 +209,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
; X64-NEXT: vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda]
; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %x2 = load <16 x i16>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
- %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
%res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -221,18 +221,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
ret { <8 x i32>, <8 x i32> } %res2
}
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x52,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
ret <4 x i32> %1
}
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -252,12 +252,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
; X64-NEXT: vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda]
; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <8 x i16>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
- %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
@@ -266,18 +266,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
ret { <4 x i32>, <4 x i32> } %res2
}
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x53,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
ret <8 x i32> %1
}
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_256:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -297,11 +297,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
; X64-NEXT: vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda]
; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %x2 = load <16 x i16>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
- %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
%res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -309,9 +309,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
ret { <8 x i32>, <8 x i32> } %res2
}
-declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p) {
+define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p) {
; X86-LABEL: test_int_x86_avx512_vpdpwssds_128:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -322,12 +322,12 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
; X64: # %bb.0:
; X64-NEXT: vpdpwssds (%rdi), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x53,0x07]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <8 x i16>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
ret <4 x i32> %1
}
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_128:
; X86: # %bb.0:
; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -347,12 +347,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
; X64-NEXT: vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda]
; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <8 x i16>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
%2 = bitcast i8 %x3 to <8 x i1>
%extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
- %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4)
%5 = bitcast i8 %x3 to <8 x i1>
%extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
index 63ff88a..2aabfab 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
@@ -102,21 +102,39 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpbusds_512(<16 x
ret { <16 x i32>, <16 x i32> } %res3
}
-declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
-; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512:
+define <16 x i32>@test_int_x86_avx512_vpdpwssd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_vpdpwssd:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
ret <16 x i32> %res
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ ret <16 x i32> %res
+}
+
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512:
+; X86: # %bb.0:
; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
@@ -125,7 +143,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
; X86-NEXT: retl # encoding: [0xc3]
;
-; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
+; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512:
; X64: # %bb.0:
; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
@@ -141,21 +159,39 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
ret { <16 x i32>, <16 x i32> } %res3
}
-declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
ret <16 x i32> %res
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ ret <16 x i32> %res
+}
+
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512:
+; X86: # %bb.0:
; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
@@ -164,7 +200,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
; X86-NEXT: retl # encoding: [0xc3]
;
-; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
+; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512:
; X64: # %bb.0:
; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
index 60d0298..e97b8a5 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
@@ -86,18 +86,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
ret { <16 x i32>, <16 x i32> } %res2
}
-declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <32 x i16>, <32 x i16>)
-define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) {
; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
ret <16 x i32> %1
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -116,11 +116,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xda]
; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <16 x i32>, ptr %x2p
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %x2 = load <32 x i16>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+ %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4)
%5 = bitcast i16 %x3 to <16 x i1>
%6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
%res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
@@ -128,18 +128,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
ret { <16 x i32>, <16 x i32> } %res2
}
-declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <32 x i16>, <32 x i16>)
-define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) {
; CHECK-LABEL: test_int_x86_avx512_ask_vpdpwssds_512:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2]
; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
ret <16 x i32> %1
}
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) {
; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
; X86: # %bb.0:
; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -158,11 +158,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xda]
; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <16 x i32>, ptr %x2p
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %x2 = load <32 x i16>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
%2 = bitcast i16 %x3 to <16 x i1>
%3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+ %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4)
%5 = bitcast i16 %x3 to <16 x i1>
%6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
%res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
index 0f4a4f2..f359ece 100644
--- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
@@ -45,3 +45,47 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8
%res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
ret <8 x i32> %res
}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssd_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ ret <8 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
index de8b2a4..5748a42 100644
--- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
@@ -68,9 +68,9 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <1
ret <4 x i32> %res
}
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
@@ -80,13 +80,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
ret <8 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
@@ -96,13 +96,13 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
ret <4 x i32> %res
}
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
@@ -112,13 +112,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
ret <8 x i32> %res
}
-declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
@@ -128,6 +128,6 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
ret <4 x i32> %res
}
diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll
new file mode 100644
index 0000000..abdc296
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
index abdc296..7576b12 100644
--- a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
@@ -4,7 +4,7 @@
; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
-define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2]
@@ -14,12 +14,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
ret <4 x i32> %ret
}
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2]
@@ -29,12 +29,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
ret <8 x i32> %ret
}
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
-define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2]
@@ -44,12 +44,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
ret <4 x i32> %ret
}
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2]
@@ -59,12 +59,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
ret <8 x i32> %ret
}
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
-define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2]
@@ -74,12 +74,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
ret <4 x i32> %ret
}
-declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2]
@@ -89,12 +89,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
ret <8 x i32> %ret
}
-declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
-define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2]
@@ -104,12 +104,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
ret <4 x i32> %ret
}
-declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2]
@@ -119,12 +119,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
ret <8 x i32> %ret
}
-declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
-define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2]
@@ -134,12 +134,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
ret <4 x i32> %ret
}
-declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2]
@@ -149,12 +149,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
ret <8 x i32> %ret
}
-declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
-define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2]
@@ -164,12 +164,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
ret <4 x i32> %ret
}
-declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2]
@@ -179,7 +179,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8
; AVX10: # %bb.0:
; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2]
; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
ret <8 x i32> %ret
}
-declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll
new file mode 100644
index 0000000..293b48d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/basic-block-sections-bb-hash.ll
@@ -0,0 +1,39 @@
+;; BB section test with basic block hashes.
+
+;; Basic block sections profile with BB hashes:
+; RUN: echo 'v1' > %t
+; RUN: echo 'f foo' >> %t
+; RUN: echo 'g 0:10,1:9,2:1 1:8,3:8 2:2,3:2 3:11' >> %t
+; RUN: echo 'c 0 2 3' >> %t
+; RUN: echo 'h 0:64863A11B5CA0000 1:54F1E80D6B270006 2:54F1F4E66B270008 3:C8BC6041A2CB0009' >> %t
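+;;
+;; For reference, the profile directives used here are, roughly: 'v1' is the
+;; profile version, 'f' names the function, 'g' carries basic block and edge
+;; weights, 'c' lists a cluster of basic block IDs, and 'h' gives per-block
+;; hashes as <ID>:<hex hash> pairs.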
+; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t | FileCheck %s
+;
+define void @foo(i1 zeroext) nounwind {
+ %2 = alloca i8, align 1
+ %3 = zext i1 %0 to i8
+ store i8 %3, ptr %2, align 1
+ %4 = load i8, ptr %2, align 1
+ %5 = trunc i8 %4 to i1
+ br i1 %5, label %6, label %8
+
+6: ; preds = %1
+ %7 = call i32 @bar()
+ br label %10
+
+8: ; preds = %1
+ %9 = call i32 @baz()
+ br label %10
+
+10: ; preds = %8, %6
+ ret void
+}
+
+declare i32 @bar() #1
+
+declare i32 @baz() #1
+
+; CHECK: .section .text.foo,"ax",@progbits
+; CHECK: callq baz
+; CHECK: retq
+; CHECK: .section .text.split.foo,"ax",@progbits
+; CHECK: callq bar
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll
new file mode 100644
index 0000000..6fe7bf5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-bb-hash.ll
@@ -0,0 +1,93 @@
+; BB cluster section tests that use an edge profile and basic block hashes to generate clusters.
+; In these tests, we first generate hash values for the basic blocks and write them to the profile.
+; When generating basic block clusters, we match the hashes of the basic blocks in the current CFG
+; against those in the profile. After a successful match, we retrieve the weights of the basic
+; blocks and edges from the profile and use an inference algorithm to deduce the complete weights
+; of all basic blocks and edges. Finally, we generate "hot" and "cold" clusters based on these
+; complete weights.
+; Test 1 and Test 2 use profiles with different basic block and edge weights, which results in
+; distinct cluster partitionings.
+;
+; RUN: llc %s -O0 -mtriple=x86_64-pc-linux -function-sections -filetype=obj -basic-block-address-map -emit-bb-hash -o %t.o
+;
+; Test1: Basic blocks #0 (entry), #1 and #3 will be placed in the same section.
+; The rest will be placed in the cold section.
+;
+; RUN: echo 'v1' > %t1
+; RUN: echo 'f foo' >> %t1
+; RUN: echo 'g 0:100,1:100,2:0 1:100,3:100 2:0,3:0 3:100' >> %t1
+;
+; These commands read the BB hashes from the SHT_LLVM_BB_ADDR_MAP section
+; and append them to the basic block sections profile.
+; RUN: llvm-readobj %t.o --bb-addr-map | \
+; RUN: awk 'BEGIN {printf "h"} \
+; RUN: /ID: [0-9]+/ {id=$2} \
+; RUN: /Hash: 0x[0-9A-Fa-f]+/ {gsub(/^0x/, "", $2); hash=$2; printf " %%s:%%s", id, hash} \
+; RUN: END {print ""}' \
+; RUN: >> %t1
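+;
+; After the append above, %t1 roughly contains the following (hash values are
+; shown as placeholders; the real ones are read from %t.o at test time):
+;   v1
+;   f foo
+;   g 0:100,1:100,2:0 1:100,3:100 2:0,3:0 3:100
+;   h 0:<hash of bb 0> 1:<hash of bb 1> 2:<hash of bb 2> 3:<hash of bb 3>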
+;
+; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -basic-block-section-match-infer | \
+; RUN: FileCheck %s -check-prefixes=CHECK,LINUX-SECTIONS1
+;
+; Test2: Basic blocks #0 (entry), #2 and #3 will be placed in the same section.
+; The rest will be placed in the cold section.
+;
+; RUN: echo 'v1' > %t2
+; RUN: echo 'f foo' >> %t2
+; RUN: echo 'g 0:100,1:0,2:100 1:0,3:0 2:100,3:100 3:100' >> %t2
+;
+; These commands read the BB hashes from the SHT_LLVM_BB_ADDR_MAP section
+; and append them to the basic block sections profile.
+; RUN: llvm-readobj %t.o --bb-addr-map | \
+; RUN: awk 'BEGIN {printf "h"} \
+; RUN: /ID: [0-9]+/ {id=$2} \
+; RUN: /Hash: 0x[0-9A-Fa-f]+/ {gsub(/^0x/, "", $2); hash=$2; printf " %%s:%%s", id, hash} \
+; RUN: END {print ""}' \
+; RUN: >> %t2
+;
+; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -basic-block-section-match-infer | \
+; RUN: FileCheck %s -check-prefixes=CHECK,LINUX-SECTIONS2
+
+define void @foo(i1 zeroext) nounwind {
+ %2 = alloca i8, align 1
+ %3 = zext i1 %0 to i8
+ store i8 %3, ptr %2, align 1
+ %4 = load i8, ptr %2, align 1
+ %5 = trunc i8 %4 to i1
+ br i1 %5, label %6, label %8
+
+6: ; preds = %1
+ %7 = call i32 @bar()
+ br label %10
+
+8: ; preds = %1
+ %9 = call i32 @baz()
+ br label %10
+
+10: ; preds = %8, %6
+ ret void
+}
+
+declare i32 @bar() #1
+
+declare i32 @baz() #1
+
+; CHECK: .section .text.foo,"ax",@progbits
+; CHECK-NOT: .section
+; CHECK-LABEL: foo:
+; CHECK-NOT: .section
+; CHECK-NOT: .LBB_END0_{{0-9}}+
+; LINUX-SECTIONS1-LABEL: # %bb.1:
+; LINUX-SECTIONS2-LABEL: # %bb.2:
+; CHECK-NOT: .section
+; CHECK-NOT: .LBB_END0_{{0-9}}+
+; CHECK-LABEL: .LBB0_3:
+; CHECK-LABEL: .LBB_END0_3:
+; CHECK-NEXT: .section .text.split.foo,"ax",@progbits
+; CHECK-LABEL: foo.cold:
+; LINUX-SECTIONS1-LABEL: .LBB_END0_2:
+; LINUX-SECTIONS2-LABEL: .LBB_END0_1:
+; LINUX-SECTIONS1-LABEL: .size foo.cold, .LBB_END0_2-foo.cold
+; LINUX-SECTIONS2-LABEL: .size foo.cold, .LBB_END0_1-foo.cold
+; CHECK-LABEL: .Lfunc_end0:
+; CHECK-NEXT: .size foo, .Lfunc_end0-foo
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll
index 751ab76..eb0a14b 100644
--- a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll
@@ -69,6 +69,20 @@
; RUN: echo 'g 0:4,1:2:3' >> %t15
; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t15 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR15
; CHECK-ERROR15: LLVM ERROR: invalid profile {{.*}} at line 4: unsigned integer expected: '2:3'
+; RUN: echo 'v1' > %t16
+; RUN: echo 'f dummy1' >> %t16
+; RUN: echo 'c 0 1' >> %t16
+; RUN: echo 'g 0:4,1:2' >> %t16
+; RUN: echo 'h a:1111111111111111 1:ffffffffffffffff' >> %t16
+; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t16 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR16
+; CHECK-ERROR16: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected: 'a'
+; RUN: echo 'v1' > %t17
+; RUN: echo 'f dummy1' >> %t17
+; RUN: echo 'c 0 1' >> %t17
+; RUN: echo 'g 0:4,1:2' >> %t17
+; RUN: echo 'h 0:111111111111111g 1:ffffffffffffffff' >> %t17
+; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t17 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR17
+; CHECK-ERROR17: LLVM ERROR: invalid profile {{.*}} at line 5: unsigned integer expected in hex format: '111111111111111g'
define i32 @dummy1(i32 %x, i32 %y, i32 %z) {
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-list.ll b/llvm/test/CodeGen/X86/basic-block-sections-list.ll
index 45ef452..d171821 100644
--- a/llvm/test/CodeGen/X86/basic-block-sections-list.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-list.ll
@@ -1,17 +1,13 @@
-;; Check the basic block sections list option.
-;; version 0 profile:
-; RUN: echo '!_Z3foob' > %t1
+;; Check that specifying the function in the basic block sections profile
+;; without any other directives is a no-op.
;;
-;; version 1 profile:
-; RUN: echo 'v1' > %t2
-; RUN: echo 'f _Z3foob' >> %t2
+;; Specify the bb sections profile:
+; RUN: echo 'v1' > %t
+; RUN: echo 'f _Z3foob' >> %t
;;
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t1 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX
-; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names | FileCheck %s -check-prefix=LINUX-SECTIONS --check-prefix=LINUX-SECTIONS-NO-FUNCTION-SECTION
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t2 -unique-basic-block-section-names --bbsections-guided-section-prefix=false | FileCheck %s -check-prefix=LINUX-SECTIONS-NO-GUIDED-PREFIX
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t > %t.bbsections
+; RUN: llc < %s -mtriple=x86_64-pc-linux -function-sections > %t.orig
+; RUN: diff -u %t.orig %t.bbsections
define i32 @_Z3foob(i1 zeroext %0) nounwind {
%2 = alloca i32, align 4
@@ -41,45 +37,3 @@ define i32 @_Z3foob(i1 zeroext %0) nounwind {
declare i32 @_Z3barv() #1
declare i32 @_Z3bazv() #1
-
-define i32 @_Z3zipb(i1 zeroext %0) nounwind {
- %2 = alloca i32, align 4
- %3 = alloca i8, align 1
- %4 = zext i1 %0 to i8
- store i8 %4, ptr %3, align 1
- %5 = load i8, ptr %3, align 1
- %6 = trunc i8 %5 to i1
- %7 = zext i1 %6 to i32
- %8 = icmp sgt i32 %7, 0
- br i1 %8, label %9, label %11
-
-9: ; preds = %1
- %10 = call i32 @_Z3barv()
- store i32 %10, ptr %2, align 4
- br label %13
-
-11: ; preds = %1
- %12 = call i32 @_Z3bazv()
- store i32 %12, ptr %2, align 4
- br label %13
-
-13: ; preds = %11, %9
- %14 = load i32, ptr %2, align 4
- ret i32 %14
-}
-
-; LINUX-SECTIONS-NO-GUIDED-PREFIX: .section .text._Z3foob,"ax",@progbits
-; LINUX-SECTIONS: .section .text.hot._Z3foob,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob:
-; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.1,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob.__part.1:
-; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.2,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob.__part.2:
-; LINUX-SECTIONS: .section .text.hot._Z3foob._Z3foob.__part.3,"ax",@progbits
-; LINUX-SECTIONS: _Z3foob.__part.3:
-
-; LINUX-SECTIONS-FUNCTION-SECTION: .section .text._Z3zipb,"ax",@progbits
-; LINUX-SECTIONS-NO-FUNCTION-SECTION-NOT: .section .text{{.*}}._Z3zipb,"ax",@progbits
-; LINUX-SECTIONS: _Z3zipb:
-; LINUX-SECTIONS-NOT: .section .text{{.*}}._Z3zipb.__part.{{[0-9]+}},"ax",@progbits
-; LINUX-SECTIONS-NOT: _Z3zipb.__part.{{[0-9]+}}:
diff --git a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll
index d481b14..6e0db20 100644
--- a/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll
+++ b/llvm/test/CodeGen/X86/basic-block-sections-source-drift.ll
@@ -1,6 +1,8 @@
-; RUN: echo "!foo" > %t.order.txt
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt | FileCheck --check-prefix=SOURCE-DRIFT %s
-; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t.order.txt -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s
+; RUN: echo "v1" > %t
+; RUN: echo "f foo" >> %t
+; RUN: echo "c 0" >> %t
+; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t | FileCheck --check-prefix=SOURCE-DRIFT %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -basic-block-sections=%t -bbsections-detect-source-drift=false | FileCheck --check-prefix=HASH-CHECK-DISABLED %s
define dso_local i32 @foo(i1 zeroext %0, i1 zeroext %1) !annotation !1 {
br i1 %0, label %5, label %3
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 684e292..7bccd6b 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,SSE2
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,BF16
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,F16,FP16
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=CHECK,AVX,BF16,AVXNC
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64,SSE2
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512vl | FileCheck %s --check-prefixes=X64,AVX,AVX512,AVX512BF16
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avx512bf16,avx512fp16,avx512vl | FileCheck %s --check-prefixes=X64,AVX,AVX512,AVX512FP16
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert,f16c | FileCheck %s --check-prefixes=X64,AVX,AVXNC
define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; X86-LABEL: add:
@@ -39,18 +39,18 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind {
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
-; F16-LABEL: add:
-; F16: # %bb.0:
-; F16-NEXT: movzwl (%rsi), %eax
-; F16-NEXT: shll $16, %eax
-; F16-NEXT: vmovd %eax, %xmm0
-; F16-NEXT: movzwl (%rdi), %eax
-; F16-NEXT: shll $16, %eax
-; F16-NEXT: vmovd %eax, %xmm1
-; F16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; F16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; F16-NEXT: vpextrw $0, %xmm0, (%rdx)
-; F16-NEXT: retq
+; AVX512-LABEL: add:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movzwl (%rsi), %eax
+; AVX512-NEXT: shll $16, %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: movzwl (%rdi), %eax
+; AVX512-NEXT: shll $16, %eax
+; AVX512-NEXT: vmovd %eax, %xmm1
+; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rdx)
+; AVX512-NEXT: retq
;
; AVXNC-LABEL: add:
; AVXNC: # %bb.0:
@@ -98,17 +98,29 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind {
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
-; FP16-LABEL: add2:
-; FP16: # %bb.0:
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: vmovw %xmm1, %ecx
-; FP16-NEXT: shll $16, %ecx
-; FP16-NEXT: vmovd %ecx, %xmm0
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm1
-; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: retq
+; AVX512BF16-LABEL: add2:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax
+; AVX512BF16-NEXT: vpextrw $0, %xmm1, %ecx
+; AVX512BF16-NEXT: shll $16, %ecx
+; AVX512BF16-NEXT: vmovd %ecx, %xmm0
+; AVX512BF16-NEXT: shll $16, %eax
+; AVX512BF16-NEXT: vmovd %eax, %xmm1
+; AVX512BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512BF16-NEXT: retq
+;
+; AVX512FP16-LABEL: add2:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vmovw %xmm0, %eax
+; AVX512FP16-NEXT: vmovw %xmm1, %ecx
+; AVX512FP16-NEXT: shll $16, %ecx
+; AVX512FP16-NEXT: vmovd %ecx, %xmm0
+; AVX512FP16-NEXT: shll $16, %eax
+; AVX512FP16-NEXT: vmovd %eax, %xmm1
+; AVX512FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512FP16-NEXT: retq
;
; AVXNC-LABEL: add2:
; AVXNC: # %bb.0:
@@ -189,34 +201,63 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; FP16-LABEL: add_double:
-; FP16: # %bb.0:
-; FP16-NEXT: pushq %rbp
-; FP16-NEXT: pushq %r14
-; FP16-NEXT: pushq %rbx
-; FP16-NEXT: movq %rdx, %rbx
-; FP16-NEXT: movq %rsi, %r14
-; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vmovw %xmm0, %ebp
-; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: shll $16, %ebp
-; FP16-NEXT: vmovd %ebp, %xmm1
-; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; FP16-NEXT: vmovsd %xmm0, (%rbx)
-; FP16-NEXT: popq %rbx
-; FP16-NEXT: popq %r14
-; FP16-NEXT: popq %rbp
-; FP16-NEXT: retq
+; AVX512BF16-LABEL: add_double:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: pushq %rbp
+; AVX512BF16-NEXT: pushq %r14
+; AVX512BF16-NEXT: pushq %rbx
+; AVX512BF16-NEXT: movq %rdx, %rbx
+; AVX512BF16-NEXT: movq %rsi, %r14
+; AVX512BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %ebp
+; AVX512BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax
+; AVX512BF16-NEXT: shll $16, %eax
+; AVX512BF16-NEXT: vmovd %eax, %xmm0
+; AVX512BF16-NEXT: shll $16, %ebp
+; AVX512BF16-NEXT: vmovd %ebp, %xmm1
+; AVX512BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512BF16-NEXT: vmovd %xmm0, %eax
+; AVX512BF16-NEXT: shll $16, %eax
+; AVX512BF16-NEXT: vmovd %eax, %xmm0
+; AVX512BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512BF16-NEXT: vmovsd %xmm0, (%rbx)
+; AVX512BF16-NEXT: popq %rbx
+; AVX512BF16-NEXT: popq %r14
+; AVX512BF16-NEXT: popq %rbp
+; AVX512BF16-NEXT: retq
+;
+; AVX512FP16-LABEL: add_double:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: pushq %rbp
+; AVX512FP16-NEXT: pushq %r14
+; AVX512FP16-NEXT: pushq %rbx
+; AVX512FP16-NEXT: movq %rdx, %rbx
+; AVX512FP16-NEXT: movq %rsi, %r14
+; AVX512FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vmovw %xmm0, %ebp
+; AVX512FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vmovw %xmm0, %eax
+; AVX512FP16-NEXT: shll $16, %eax
+; AVX512FP16-NEXT: vmovd %eax, %xmm0
+; AVX512FP16-NEXT: shll $16, %ebp
+; AVX512FP16-NEXT: vmovd %ebp, %xmm1
+; AVX512FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512FP16-NEXT: vmovw %xmm0, %eax
+; AVX512FP16-NEXT: shll $16, %eax
+; AVX512FP16-NEXT: vmovd %eax, %xmm0
+; AVX512FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512FP16-NEXT: vmovsd %xmm0, (%rbx)
+; AVX512FP16-NEXT: popq %rbx
+; AVX512FP16-NEXT: popq %r14
+; AVX512FP16-NEXT: popq %rbp
+; AVX512FP16-NEXT: retq
;
; AVXNC-LABEL: add_double:
; AVXNC: # %bb.0:
@@ -310,30 +351,55 @@ define double @add_double2(double %da, double %db) nounwind {
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
-; FP16-LABEL: add_double2:
-; FP16: # %bb.0:
-; FP16-NEXT: pushq %rbx
-; FP16-NEXT: subq $16, %rsp
-; FP16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vmovw %xmm0, %ebx
-; FP16-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; FP16-NEXT: # xmm0 = mem[0],zero
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: shll $16, %ebx
-; FP16-NEXT: vmovd %ebx, %xmm1
-; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
-; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; FP16-NEXT: addq $16, %rsp
-; FP16-NEXT: popq %rbx
-; FP16-NEXT: retq
+; AVX512BF16-LABEL: add_double2:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: pushq %rbx
+; AVX512BF16-NEXT: subq $16, %rsp
+; AVX512BF16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %ebx
+; AVX512BF16-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload
+; AVX512BF16-NEXT: # xmm0 = mem[0],zero
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax
+; AVX512BF16-NEXT: shll $16, %eax
+; AVX512BF16-NEXT: vmovd %eax, %xmm0
+; AVX512BF16-NEXT: shll $16, %ebx
+; AVX512BF16-NEXT: vmovd %ebx, %xmm1
+; AVX512BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512BF16-NEXT: vmovd %xmm0, %eax
+; AVX512BF16-NEXT: shll $16, %eax
+; AVX512BF16-NEXT: vmovd %eax, %xmm0
+; AVX512BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512BF16-NEXT: addq $16, %rsp
+; AVX512BF16-NEXT: popq %rbx
+; AVX512BF16-NEXT: retq
+;
+; AVX512FP16-LABEL: add_double2:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: pushq %rbx
+; AVX512FP16-NEXT: subq $16, %rsp
+; AVX512FP16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vmovw %xmm0, %ebx
+; AVX512FP16-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; AVX512FP16-NEXT: # xmm0 = mem[0],zero
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vmovw %xmm0, %eax
+; AVX512FP16-NEXT: shll $16, %eax
+; AVX512FP16-NEXT: vmovd %eax, %xmm0
+; AVX512FP16-NEXT: shll $16, %ebx
+; AVX512FP16-NEXT: vmovd %ebx, %xmm1
+; AVX512FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512FP16-NEXT: vmovw %xmm0, %eax
+; AVX512FP16-NEXT: shll $16, %eax
+; AVX512FP16-NEXT: vmovd %eax, %xmm0
+; AVX512FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512FP16-NEXT: addq $16, %rsp
+; AVX512FP16-NEXT: popq %rbx
+; AVX512FP16-NEXT: retq
;
; AVXNC-LABEL: add_double2:
; AVXNC: # %bb.0:
@@ -393,15 +459,15 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind {
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: retq
;
-; F16-LABEL: add_constant:
-; F16: # %bb.0:
-; F16-NEXT: movzwl (%rdi), %eax
-; F16-NEXT: shll $16, %eax
-; F16-NEXT: vmovd %eax, %xmm0
-; F16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; F16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; F16-NEXT: vpextrw $0, %xmm0, (%rsi)
-; F16-NEXT: retq
+; AVX512-LABEL: add_constant:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movzwl (%rdi), %eax
+; AVX512-NEXT: shll $16, %eax
+; AVX512-NEXT: vmovd %eax, %xmm0
+; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
+; AVX512-NEXT: retq
;
; AVXNC-LABEL: add_constant:
; AVXNC: # %bb.0:
@@ -439,14 +505,23 @@ define bfloat @add_constant2(bfloat %a) nounwind {
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
-; FP16-LABEL: add_constant2:
-; FP16: # %bb.0:
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: retq
+; AVX512BF16-LABEL: add_constant2:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax
+; AVX512BF16-NEXT: shll $16, %eax
+; AVX512BF16-NEXT: vmovd %eax, %xmm0
+; AVX512BF16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512BF16-NEXT: retq
+;
+; AVX512FP16-LABEL: add_constant2:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vmovw %xmm0, %eax
+; AVX512FP16-NEXT: shll $16, %eax
+; AVX512FP16-NEXT: vmovd %eax, %xmm0
+; AVX512FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512FP16-NEXT: retq
;
; AVXNC-LABEL: add_constant2:
; AVXNC: # %bb.0:
@@ -467,10 +542,10 @@ define void @store_constant(ptr %pc) nounwind {
; X86-NEXT: movw $16256, (%eax) # imm = 0x3F80
; X86-NEXT: retl
;
-; CHECK-LABEL: store_constant:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movw $16256, (%rdi) # imm = 0x3F80
-; CHECK-NEXT: retq
+; X64-LABEL: store_constant:
+; X64: # %bb.0:
+; X64-NEXT: movw $16256, (%rdi) # imm = 0x3F80
+; X64-NEXT: retq
store bfloat 1.0, ptr %pc
ret void
}
@@ -484,11 +559,11 @@ define void @fold_ext_trunc(ptr %pa, ptr %pc) nounwind {
; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: retl
;
-; CHECK-LABEL: fold_ext_trunc:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movzwl (%rdi), %eax
-; CHECK-NEXT: movw %ax, (%rsi)
-; CHECK-NEXT: retq
+; X64-LABEL: fold_ext_trunc:
+; X64: # %bb.0:
+; X64-NEXT: movzwl (%rdi), %eax
+; X64-NEXT: movw %ax, (%rsi)
+; X64-NEXT: retq
%a = load bfloat, ptr %pa
%ext = fpext bfloat %a to float
%trunc = fptrunc float %ext to bfloat
@@ -502,9 +577,9 @@ define bfloat @fold_ext_trunc2(bfloat %a) nounwind {
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
;
-; CHECK-LABEL: fold_ext_trunc2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: retq
+; X64-LABEL: fold_ext_trunc2:
+; X64: # %bb.0:
+; X64-NEXT: retq
%ext = fpext bfloat %a to float
%trunc = fptrunc float %ext to bfloat
ret bfloat %trunc
@@ -526,11 +601,17 @@ define bfloat @fold_from_half(half %a) nounwind {
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
-; FP16-LABEL: fold_from_half:
-; FP16: # %bb.0:
-; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
-; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: retq
+; AVX512BF16-LABEL: fold_from_half:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512BF16-NEXT: retq
+;
+; AVX512FP16-LABEL: fold_from_half:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0
+; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512FP16-NEXT: retq
;
; AVXNC-LABEL: fold_from_half:
; AVXNC: # %bb.0:
@@ -561,21 +642,29 @@ define half @fold_to_half(bfloat %a) nounwind {
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
-; BF16-LABEL: fold_to_half:
-; BF16: # %bb.0:
-; BF16-NEXT: vpextrw $0, %xmm0, %eax
-; BF16-NEXT: shll $16, %eax
-; BF16-NEXT: vmovd %eax, %xmm0
-; BF16-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; BF16-NEXT: retq
-;
-; FP16-LABEL: fold_to_half:
-; FP16: # %bb.0:
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
-; FP16-NEXT: retq
+; AVX512BF16-LABEL: fold_to_half:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax
+; AVX512BF16-NEXT: shll $16, %eax
+; AVX512BF16-NEXT: vmovd %eax, %xmm0
+; AVX512BF16-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512BF16-NEXT: retq
+;
+; AVX512FP16-LABEL: fold_to_half:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vmovw %xmm0, %eax
+; AVX512FP16-NEXT: shll $16, %eax
+; AVX512FP16-NEXT: vmovd %eax, %xmm0
+; AVX512FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0
+; AVX512FP16-NEXT: retq
+;
+; AVXNC-LABEL: fold_to_half:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
+; AVXNC-NEXT: shll $16, %eax
+; AVXNC-NEXT: vmovd %eax, %xmm0
+; AVXNC-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVXNC-NEXT: retq
%ext = fpext bfloat %a to float
%trunc = fptrunc float %ext to half
ret half %trunc
@@ -587,9 +676,9 @@ define bfloat @bitcast_from_half(half %a) nounwind {
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
;
-; CHECK-LABEL: bitcast_from_half:
-; CHECK: # %bb.0:
-; CHECK-NEXT: retq
+; X64-LABEL: bitcast_from_half:
+; X64: # %bb.0:
+; X64-NEXT: retq
%bc = bitcast half %a to bfloat
ret bfloat %bc
}
@@ -600,9 +689,9 @@ define half @bitcast_to_half(bfloat %a) nounwind {
; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero
; X86-NEXT: retl
;
-; CHECK-LABEL: bitcast_to_half:
-; CHECK: # %bb.0:
-; CHECK-NEXT: retq
+; X64-LABEL: bitcast_to_half:
+; X64: # %bb.0:
+; X64-NEXT: retq
%bc = bitcast bfloat %a to half
ret half %bc
}
@@ -753,16 +842,16 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; F16-LABEL: addv:
-; F16: # %bb.0:
-; F16-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; F16-NEXT: vpslld $16, %ymm1, %ymm1
-; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; F16-NEXT: vpslld $16, %ymm0, %ymm0
-; F16-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
-; F16-NEXT: vzeroupper
-; F16-NEXT: retq
+; AVX512-LABEL: addv:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT: vpslld $16, %ymm1, %ymm1
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpslld $16, %ymm0, %ymm0
+; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vcvtneps2bf16 %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
;
; AVXNC-LABEL: addv:
; AVXNC: # %bb.0:
@@ -791,16 +880,22 @@ define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: retq
;
-; BF16-LABEL: pr62997:
-; BF16: # %bb.0:
-; BF16-NEXT: vpextrw $0, %xmm1, %eax
-; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
-; BF16-NEXT: retq
+; AVX512BF16-LABEL: pr62997:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: vpextrw $0, %xmm1, %eax
+; AVX512BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX512BF16-NEXT: retq
;
-; FP16-LABEL: pr62997:
-; FP16: # %bb.0:
-; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; FP16-NEXT: retq
+; AVX512FP16-LABEL: pr62997:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512FP16-NEXT: retq
+;
+; AVXNC-LABEL: pr62997:
+; AVXNC: # %bb.0:
+; AVXNC-NEXT: vpextrw $0, %xmm1, %eax
+; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
+; AVXNC-NEXT: retq
%1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0
%2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1
ret <2 x bfloat> %2
@@ -820,10 +915,10 @@ define <32 x bfloat> @pr63017() {
; SSE2-NEXT: xorps %xmm3, %xmm3
; SSE2-NEXT: retq
;
-; F16-LABEL: pr63017:
-; F16: # %bb.0:
-; F16-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; F16-NEXT: retq
+; AVX512-LABEL: pr63017:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
;
; AVXNC-LABEL: pr63017:
; AVXNC: # %bb.0:
@@ -1077,11 +1172,17 @@ define <32 x bfloat> @pr63017_2() nounwind {
; SSE2-NEXT: popq %r14
; SSE2-NEXT: retq
;
-; FP16-LABEL: pr63017_2:
-; FP16: # %bb.0:
-; FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
-; FP16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1}
-; FP16-NEXT: retq
+; AVX512BF16-LABEL: pr63017_2:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
+; AVX512BF16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1}
+; AVX512BF16-NEXT: retq
+;
+; AVX512FP16-LABEL: pr63017_2:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0]
+; AVX512FP16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1}
+; AVX512FP16-NEXT: retq
;
; AVXNC-LABEL: pr63017_2:
; AVXNC: # %bb.0:
@@ -1118,12 +1219,19 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) {
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
;
-; FP16-LABEL: pr62997_3:
-; FP16: # %bb.0:
-; FP16-NEXT: vmovw %xmm1, %eax
-; FP16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1
-; FP16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; FP16-NEXT: retq
+; AVX512BF16-LABEL: pr62997_3:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: vpextrw $0, %xmm1, %eax
+; AVX512BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1
+; AVX512BF16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512BF16-NEXT: retq
+;
+; AVX512FP16-LABEL: pr62997_3:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vmovw %xmm1, %eax
+; AVX512FP16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1
+; AVX512FP16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512FP16-NEXT: retq
;
; AVXNC-LABEL: pr62997_3:
; AVXNC: # %bb.0:
@@ -1206,11 +1314,11 @@ define <16 x float> @pr64460_3(<16 x bfloat> %a) {
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: retq
;
-; F16-LABEL: pr64460_3:
-; F16: # %bb.0:
-; F16-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; F16-NEXT: vpslld $16, %zmm0, %zmm0
-; F16-NEXT: retq
+; AVX512-LABEL: pr64460_3:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; AVX512-NEXT: vpslld $16, %zmm0, %zmm0
+; AVX512-NEXT: retq
;
; AVXNC-LABEL: pr64460_3:
; AVXNC: # %bb.0:
@@ -1248,12 +1356,12 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) {
; SSE2-NEXT: movaps %xmm4, %xmm0
; SSE2-NEXT: retq
;
-; F16-LABEL: pr64460_4:
-; F16: # %bb.0:
-; F16-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; F16-NEXT: vpslld $16, %ymm0, %ymm0
-; F16-NEXT: vcvtps2pd %ymm0, %zmm0
-; F16-NEXT: retq
+; AVX512-LABEL: pr64460_4:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vpslld $16, %ymm0, %ymm0
+; AVX512-NEXT: vcvtps2pd %ymm0, %zmm0
+; AVX512-NEXT: retq
;
; AVXNC-LABEL: pr64460_4:
; AVXNC: # %bb.0:
@@ -1301,12 +1409,12 @@ define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind {
; SSE2-NEXT: addq $72, %rsp
; SSE2-NEXT: retq
;
-; F16-LABEL: fptrunc_v4f32:
-; F16: # %bb.0:
-; F16-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
-; F16-NEXT: vzeroupper
-; F16-NEXT: retq
+; AVX512-LABEL: fptrunc_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vcvtneps2bf16 %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v4f32:
; AVXNC: # %bb.0:
@@ -1387,11 +1495,11 @@ define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; F16-LABEL: fptrunc_v8f32:
-; F16: # %bb.0:
-; F16-NEXT: vcvtneps2bf16 %ymm0, %xmm0
-; F16-NEXT: vzeroupper
-; F16-NEXT: retq
+; AVX512-LABEL: fptrunc_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtneps2bf16 %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v8f32:
; AVXNC: # %bb.0:
@@ -1526,10 +1634,10 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; F16-LABEL: fptrunc_v16f32:
-; F16: # %bb.0:
-; F16-NEXT: vcvtneps2bf16 %zmm0, %ymm0
-; F16-NEXT: retq
+; AVX512-LABEL: fptrunc_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vcvtneps2bf16 %zmm0, %ymm0
+; AVX512-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v16f32:
; AVXNC: # %bb.0:
@@ -1666,63 +1774,138 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; SSE2-NEXT: popq %rbp
; SSE2-NEXT: retq
;
-; FP16-LABEL: fptrunc_v8f64:
-; FP16: # %bb.0:
-; FP16-NEXT: subq $184, %rsp
-; FP16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; FP16-NEXT: vextractf128 $1, %ymm0, %xmm0
-; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; FP16-NEXT: vzeroupper
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; FP16-NEXT: # xmm0 = mem[1,0]
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; FP16-NEXT: vzeroupper
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; FP16-NEXT: vextractf32x4 $2, %zmm0, %xmm0
-; FP16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; FP16-NEXT: vzeroupper
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; FP16-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; FP16-NEXT: vzeroupper
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; FP16-NEXT: callq __truncdfbf2@PLT
-; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; FP16-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; FP16-NEXT: addq $184, %rsp
-; FP16-NEXT: retq
+; AVX512BF16-LABEL: fptrunc_v8f64:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: pushq %rbp
+; AVX512BF16-NEXT: pushq %r15
+; AVX512BF16-NEXT: pushq %r14
+; AVX512BF16-NEXT: pushq %r13
+; AVX512BF16-NEXT: pushq %r12
+; AVX512BF16-NEXT: pushq %rbx
+; AVX512BF16-NEXT: subq $184, %rsp
+; AVX512BF16-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BF16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512BF16-NEXT: vzeroupper
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BF16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512BF16-NEXT: # xmm0 = mem[1,0]
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BF16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BF16-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BF16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BF16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512BF16-NEXT: vzeroupper
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BF16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BF16-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512BF16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512BF16-NEXT: vzeroupper
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BF16-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX512BF16-NEXT: # xmm0 = mem[1,0]
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512BF16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BF16-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BF16-NEXT: vzeroupper
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BF16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512BF16-NEXT: # xmm0 = mem[1,0]
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %ebx
+; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %ebp
+; AVX512BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %r14d
+; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %r15d
+; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %r12d
+; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %r13d
+; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512BF16-NEXT: callq __truncdfbf2@PLT
+; AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax
+; AVX512BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512BF16-NEXT: vpinsrw $1, %r13d, %xmm0, %xmm0
+; AVX512BF16-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX512BF16-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0
+; AVX512BF16-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0
+; AVX512BF16-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0
+; AVX512BF16-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0
+; AVX512BF16-NEXT: vpinsrw $7, %ebx, %xmm0, %xmm0
+; AVX512BF16-NEXT: addq $184, %rsp
+; AVX512BF16-NEXT: popq %rbx
+; AVX512BF16-NEXT: popq %r12
+; AVX512BF16-NEXT: popq %r13
+; AVX512BF16-NEXT: popq %r14
+; AVX512BF16-NEXT: popq %r15
+; AVX512BF16-NEXT: popq %rbp
+; AVX512BF16-NEXT: retq
+;
+; AVX512FP16-LABEL: fptrunc_v8f64:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: subq $184, %rsp
+; AVX512FP16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512FP16-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512FP16-NEXT: vzeroupper
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512FP16-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512FP16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX512FP16-NEXT: # xmm0 = mem[1,0]
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512FP16-NEXT: vzeroupper
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512FP16-NEXT: vextractf32x4 $2, %zmm0, %xmm0
+; AVX512FP16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
+; AVX512FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512FP16-NEXT: vzeroupper
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512FP16-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512FP16-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512FP16-NEXT: vzeroupper
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512FP16-NEXT: callq __truncdfbf2@PLT
+; AVX512FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512FP16-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX512FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX512FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
+; AVX512FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512FP16-NEXT: addq $184, %rsp
+; AVX512FP16-NEXT: retq
;
; AVXNC-LABEL: fptrunc_v8f64:
; AVXNC: # %bb.0:
@@ -1817,10 +2000,10 @@ define <32 x bfloat> @test_v8bf16_v32bf16(ptr %0) {
; SSE2-NEXT: movaps %xmm0, %xmm3
; SSE2-NEXT: retq
;
-; F16-LABEL: test_v8bf16_v32bf16:
-; F16: # %bb.0:
-; F16-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; F16-NEXT: retq
+; AVX512-LABEL: test_v8bf16_v32bf16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-NEXT: retq
;
; AVXNC-LABEL: test_v8bf16_v32bf16:
; AVXNC: # %bb.0:
@@ -1959,13 +2142,21 @@ define float @trunc_ext(float %a) nounwind {
; SSE2-NEXT: popq %rax
; SSE2-NEXT: retq
;
-; FP16-LABEL: trunc_ext:
-; FP16: # %bb.0:
-; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
-; FP16-NEXT: vmovw %xmm0, %eax
-; FP16-NEXT: shll $16, %eax
-; FP16-NEXT: vmovd %eax, %xmm0
-; FP16-NEXT: retq
+; AVX512BF16-LABEL: trunc_ext:
+; AVX512BF16: # %bb.0:
+; AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512BF16-NEXT: vmovd %xmm0, %eax
+; AVX512BF16-NEXT: shll $16, %eax
+; AVX512BF16-NEXT: vmovd %eax, %xmm0
+; AVX512BF16-NEXT: retq
+;
+; AVX512FP16-LABEL: trunc_ext:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
+; AVX512FP16-NEXT: vmovw %xmm0, %eax
+; AVX512FP16-NEXT: shll $16, %eax
+; AVX512FP16-NEXT: vmovd %eax, %xmm0
+; AVX512FP16-NEXT: retq
;
; AVXNC-LABEL: trunc_ext:
; AVXNC: # %bb.0:
@@ -2042,14 +2233,14 @@ define bfloat @PR108936(x86_fp80 %0) nounwind {
; X86-NEXT: addl $12, %esp
; X86-NEXT: retl
;
-; CHECK-LABEL: PR108936:
-; CHECK: # %bb.0:
-; CHECK-NEXT: subq $24, %rsp
-; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
-; CHECK-NEXT: fstpt (%rsp)
-; CHECK-NEXT: callq __truncxfbf2@PLT
-; CHECK-NEXT: addq $24, %rsp
-; CHECK-NEXT: retq
+; X64-LABEL: PR108936:
+; X64: # %bb.0:
+; X64-NEXT: subq $24, %rsp
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
+; X64-NEXT: fstpt (%rsp)
+; X64-NEXT: callq __truncxfbf2@PLT
+; X64-NEXT: addq $24, %rsp
+; X64-NEXT: retq
%2 = fptrunc x86_fp80 %0 to bfloat
ret bfloat %2
}
@@ -2064,12 +2255,12 @@ define bfloat @PR115710(fp128 %0) nounwind {
; X86-NEXT: addl $28, %esp
; X86-NEXT: retl
;
-; CHECK-LABEL: PR115710:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushq %rax
-; CHECK-NEXT: callq __trunctfbf2@PLT
-; CHECK-NEXT: popq %rax
-; CHECK-NEXT: retq
+; X64-LABEL: PR115710:
+; X64: # %bb.0:
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq __trunctfbf2@PLT
+; X64-NEXT: popq %rax
+; X64-NEXT: retq
%2 = fptrunc fp128 %0 to bfloat
ret bfloat %2
}
diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
index 86d7df0c..fae1ff9 100644
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -216,8 +216,8 @@ define i1 @trunc_v8i16_cmp(<8 x i16> %a0) nounwind {
define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
; SSE-LABEL: bitcast_v16i8_to_v2i8:
; SSE: # %bb.0:
-; SSE-NEXT: pmovmskb %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: # kill: def $al killed $al killed $eax
@@ -225,8 +225,8 @@ define i8 @bitcast_v16i8_to_v2i8(<16 x i8> %a0) nounwind {
;
; AVX12-LABEL: bitcast_v16i8_to_v2i8:
; AVX12: # %bb.0:
-; AVX12-NEXT: vpmovmskb %xmm0, %ecx
-; AVX12-NEXT: movl %ecx, %eax
+; AVX12-NEXT: vpmovmskb %xmm0, %eax
+; AVX12-NEXT: movl %eax, %ecx
; AVX12-NEXT: shrl $8, %eax
; AVX12-NEXT: addb %cl, %al
; AVX12-NEXT: # kill: def $al killed $al killed $eax
@@ -441,8 +441,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: bitcast_v16i16_to_v2i8:
; SSE: # %bb.0:
; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: # kill: def $al killed $al killed $eax
@@ -452,8 +452,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %ecx
-; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: shrl $8, %eax
; AVX1-NEXT: addb %cl, %al
; AVX1-NEXT: # kill: def $al killed $al killed $eax
@@ -464,8 +464,8 @@ define i8 @bitcast_v16i16_to_v2i8(<16 x i16> %a0) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpmovmskb %xmm0, %ecx
-; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shrl $8, %eax
; AVX2-NEXT: addb %cl, %al
; AVX2-NEXT: # kill: def $al killed $al killed $eax
@@ -762,8 +762,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
; SSE-NEXT: packssdw %xmm3, %xmm2
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packsswb %xmm2, %xmm0
-; SSE-NEXT: pmovmskb %xmm0, %ecx
-; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: movl %eax, %ecx
; SSE-NEXT: shrl $8, %eax
; SSE-NEXT: addb %cl, %al
; SSE-NEXT: # kill: def $al killed $al killed $eax
@@ -776,8 +776,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovmskb %xmm0, %ecx
-; AVX1-NEXT: movl %ecx, %eax
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: shrl $8, %eax
; AVX1-NEXT: addb %cl, %al
; AVX1-NEXT: # kill: def $al killed $al killed $eax
@@ -793,8 +793,8 @@ define i8 @bitcast_v16i32_to_v2i8(<16 x i32> %a0) nounwind {
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-NEXT: vpmovmskb %xmm0, %ecx
-; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shrl $8, %eax
; AVX2-NEXT: addb %cl, %al
; AVX2-NEXT: # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index 13149d7..749b3dd 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=CHECK,AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefixes=AVX512,AVX512POPCNT
;
; CTPOP
@@ -16,6 +17,14 @@ define i32 @test_ctpop_i128(i128 %a0) nounwind {
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
+;
+; AVX512-LABEL: test_ctpop_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: popcntq %rsi, %rcx
+; AVX512-NEXT: popcntq %rdi, %rax
+; AVX512-NEXT: addl %ecx, %eax
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
%cnt = call i128 @llvm.ctpop.i128(i128 %a0)
%res = trunc i128 %cnt to i32
ret i32 %res
@@ -29,12 +38,77 @@ define i32 @load_ctpop_i128(ptr %p0) nounwind {
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
+;
+; AVX512-LABEL: load_ctpop_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: popcntq 8(%rdi), %rcx
+; AVX512-NEXT: popcntq (%rdi), %rax
+; AVX512-NEXT: addl %ecx, %eax
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
%a0 = load i128, ptr %p0
%cnt = call i128 @llvm.ctpop.i128(i128 %a0)
%res = trunc i128 %cnt to i32
ret i32 %res
}
+define i32 @vector_ctpop_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: popcntq %rcx, %rcx
+; SSE-NEXT: popcntq %rax, %rax
+; SSE-NEXT: addl %ecx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctpop_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: popcntq %rax, %rdx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: popcntq %rcx, %rax
+; AVX2-NEXT: addl %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctpop_i128:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vmovq %xmm0, %rcx
+; AVX512F-NEXT: popcntq %rax, %rdx
+; AVX512F-NEXT: popcntq %rcx, %rax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctpop_i128:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: popcntq %rcx, %rcx
+; AVX512VL-NEXT: popcntq %rax, %rax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i128:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: popcntq %rcx, %rcx
+; AVX512POPCNT-NEXT: popcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128
+ %cnt = call i128 @llvm.ctpop.i128(i128 %a0)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctpop_i256(i256 %a0) nounwind {
; CHECK-LABEL: test_ctpop_i256:
; CHECK: # %bb.0:
@@ -50,6 +124,48 @@ define i32 @test_ctpop_i256(i256 %a0) nounwind {
; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
+;
+; AVX512F-LABEL: test_ctpop_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: popcntq %rcx, %rax
+; AVX512F-NEXT: popcntq %rdx, %rcx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: popcntq %rsi, %rdx
+; AVX512F-NEXT: popcntq %rdi, %rax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: addl %ecx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_ctpop_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: popcntq %rcx, %rax
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: popcntq %rdx, %rcx
+; AVX512VL-NEXT: addl %eax, %ecx
+; AVX512VL-NEXT: xorl %edx, %edx
+; AVX512VL-NEXT: popcntq %rsi, %rdx
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq %rdi, %rax
+; AVX512VL-NEXT: addl %edx, %eax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_ctpop_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: popcntq %rcx, %rax
+; AVX512POPCNT-NEXT: xorl %ecx, %ecx
+; AVX512POPCNT-NEXT: popcntq %rdx, %rcx
+; AVX512POPCNT-NEXT: addl %eax, %ecx
+; AVX512POPCNT-NEXT: xorl %edx, %edx
+; AVX512POPCNT-NEXT: popcntq %rsi, %rdx
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq %rdi, %rax
+; AVX512POPCNT-NEXT: addl %edx, %eax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
%cnt = call i256 @llvm.ctpop.i256(i256 %a0)
%res = trunc i256 %cnt to i32
ret i32 %res
@@ -81,24 +197,150 @@ define i32 @load_ctpop_i256(ptr %p0) nounwind {
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: load_ctpop_i256:
-; AVX512: # %bb.0:
-; AVX512-NEXT: popcntq 24(%rdi), %rax
-; AVX512-NEXT: popcntq 16(%rdi), %rcx
-; AVX512-NEXT: addl %eax, %ecx
-; AVX512-NEXT: popcntq 8(%rdi), %rdx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq (%rdi), %rax
-; AVX512-NEXT: addl %edx, %eax
-; AVX512-NEXT: addl %ecx, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_ctpop_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: popcntq 24(%rdi), %rax
+; AVX512F-NEXT: popcntq 16(%rdi), %rcx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: popcntq 8(%rdi), %rdx
+; AVX512F-NEXT: popcntq (%rdi), %rax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: addl %ecx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_ctpop_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: popcntq 24(%rdi), %rax
+; AVX512VL-NEXT: popcntq 16(%rdi), %rcx
+; AVX512VL-NEXT: addl %eax, %ecx
+; AVX512VL-NEXT: popcntq 8(%rdi), %rdx
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq (%rdi), %rax
+; AVX512VL-NEXT: addl %edx, %eax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_ctpop_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: popcntq 24(%rdi), %rax
+; AVX512POPCNT-NEXT: popcntq 16(%rdi), %rcx
+; AVX512POPCNT-NEXT: addl %eax, %ecx
+; AVX512POPCNT-NEXT: popcntq 8(%rdi), %rdx
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq (%rdi), %rax
+; AVX512POPCNT-NEXT: addl %edx, %eax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
%a0 = load i256, ptr %p0
%cnt = call i256 @llvm.ctpop.i256(i256 %a0)
%res = trunc i256 %cnt to i32
ret i32 %res
}
+define i32 @vector_ctpop_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: movq %xmm1, %rdx
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: popcntq %rsi, %rsi
+; SSE-NEXT: popcntq %rdx, %rdx
+; SSE-NEXT: addl %esi, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: popcntq %rax, %rsi
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: popcntq %rcx, %rax
+; SSE-NEXT: addl %esi, %eax
+; SSE-NEXT: addl %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctpop_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: vmovq %xmm0, %rsi
+; AVX2-NEXT: popcntq %rdx, %rdx
+; AVX2-NEXT: popcntq %rsi, %rsi
+; AVX2-NEXT: addl %edx, %esi
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: popcntq %rax, %rdx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: popcntq %rcx, %rax
+; AVX2-NEXT: addl %edx, %eax
+; AVX2-NEXT: addl %esi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctpop_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vmovq %xmm0, %rcx
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: popcntq %rdx, %rdx
+; AVX512F-NEXT: popcntq %rsi, %rsi
+; AVX512F-NEXT: addl %edx, %esi
+; AVX512F-NEXT: popcntq %rax, %rdx
+; AVX512F-NEXT: popcntq %rcx, %rax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: addl %esi, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctpop_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT: vmovq %xmm0, %rcx
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq %xmm0, %rdx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT: popcntq %rsi, %rsi
+; AVX512VL-NEXT: popcntq %rdx, %rdx
+; AVX512VL-NEXT: addl %esi, %edx
+; AVX512VL-NEXT: xorl %esi, %esi
+; AVX512VL-NEXT: popcntq %rax, %rsi
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq %rcx, %rax
+; AVX512VL-NEXT: addl %esi, %eax
+; AVX512VL-NEXT: addl %edx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rcx
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512POPCNT-NEXT: popcntq %rsi, %rsi
+; AVX512POPCNT-NEXT: popcntq %rdx, %rdx
+; AVX512POPCNT-NEXT: addl %esi, %edx
+; AVX512POPCNT-NEXT: xorl %esi, %esi
+; AVX512POPCNT-NEXT: popcntq %rax, %rsi
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq %rcx, %rax
+; AVX512POPCNT-NEXT: addl %esi, %eax
+; AVX512POPCNT-NEXT: addl %edx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <8 x i32> %v0 to i256
+ %cnt = call i256 @llvm.ctpop.i256(i256 %a0)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctpop_i512(i512 %a0) nounwind {
; CHECK-LABEL: test_ctpop_i512:
; CHECK: # %bb.0:
@@ -124,6 +366,76 @@ define i32 @test_ctpop_i512(i512 %a0) nounwind {
; CHECK-NEXT: addl %r8d, %eax
; CHECK-NEXT: # kill: def $eax killed $eax killed $rax
; CHECK-NEXT: retq
+;
+; AVX512F-LABEL: test_ctpop_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: addl %eax, %r10d
+; AVX512F-NEXT: popcntq %r9, %rax
+; AVX512F-NEXT: popcntq %r8, %r8
+; AVX512F-NEXT: addl %eax, %r8d
+; AVX512F-NEXT: addl %r10d, %r8d
+; AVX512F-NEXT: popcntq %rcx, %rax
+; AVX512F-NEXT: popcntq %rdx, %rcx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: popcntq %rsi, %rdx
+; AVX512F-NEXT: popcntq %rdi, %rax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: addl %ecx, %eax
+; AVX512F-NEXT: addl %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_ctpop_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: addl %eax, %r10d
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq %r9, %rax
+; AVX512VL-NEXT: popcntq %r8, %r8
+; AVX512VL-NEXT: addl %eax, %r8d
+; AVX512VL-NEXT: addl %r10d, %r8d
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq %rcx, %rax
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: popcntq %rdx, %rcx
+; AVX512VL-NEXT: addl %eax, %ecx
+; AVX512VL-NEXT: xorl %edx, %edx
+; AVX512VL-NEXT: popcntq %rsi, %rdx
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq %rdi, %rax
+; AVX512VL-NEXT: addl %edx, %eax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: addl %r8d, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_ctpop_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
+; AVX512POPCNT-NEXT: addl %eax, %r10d
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq %r9, %rax
+; AVX512POPCNT-NEXT: popcntq %r8, %r8
+; AVX512POPCNT-NEXT: addl %eax, %r8d
+; AVX512POPCNT-NEXT: addl %r10d, %r8d
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq %rcx, %rax
+; AVX512POPCNT-NEXT: xorl %ecx, %ecx
+; AVX512POPCNT-NEXT: popcntq %rdx, %rcx
+; AVX512POPCNT-NEXT: addl %eax, %ecx
+; AVX512POPCNT-NEXT: xorl %edx, %edx
+; AVX512POPCNT-NEXT: popcntq %rsi, %rdx
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq %rdi, %rax
+; AVX512POPCNT-NEXT: addl %edx, %eax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: addl %r8d, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
%cnt = call i512 @llvm.ctpop.i512(i512 %a0)
%res = trunc i512 %cnt to i32
ret i32 %res
@@ -177,35 +489,239 @@ define i32 @load_ctpop_i512(ptr %p0) nounwind {
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: load_ctpop_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: popcntq 56(%rdi), %rax
-; AVX512-NEXT: popcntq 48(%rdi), %rcx
-; AVX512-NEXT: addl %eax, %ecx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq 40(%rdi), %rax
-; AVX512-NEXT: popcntq 32(%rdi), %rdx
-; AVX512-NEXT: addl %eax, %edx
-; AVX512-NEXT: addl %ecx, %edx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq 24(%rdi), %rax
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: popcntq 16(%rdi), %rcx
-; AVX512-NEXT: popcntq 8(%rdi), %rsi
-; AVX512-NEXT: addl %eax, %ecx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq (%rdi), %rax
-; AVX512-NEXT: addl %esi, %eax
-; AVX512-NEXT: addl %ecx, %eax
-; AVX512-NEXT: addl %edx, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_ctpop_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: popcntq 56(%rdi), %rax
+; AVX512F-NEXT: popcntq 48(%rdi), %rcx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: popcntq 40(%rdi), %rax
+; AVX512F-NEXT: popcntq 32(%rdi), %rdx
+; AVX512F-NEXT: addl %eax, %edx
+; AVX512F-NEXT: addl %ecx, %edx
+; AVX512F-NEXT: popcntq 24(%rdi), %rcx
+; AVX512F-NEXT: popcntq 16(%rdi), %rsi
+; AVX512F-NEXT: popcntq 8(%rdi), %r8
+; AVX512F-NEXT: popcntq (%rdi), %rax
+; AVX512F-NEXT: addl %ecx, %esi
+; AVX512F-NEXT: addl %r8d, %eax
+; AVX512F-NEXT: addl %esi, %eax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_ctpop_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: popcntq 56(%rdi), %rax
+; AVX512VL-NEXT: popcntq 48(%rdi), %rcx
+; AVX512VL-NEXT: addl %eax, %ecx
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq 40(%rdi), %rax
+; AVX512VL-NEXT: popcntq 32(%rdi), %rdx
+; AVX512VL-NEXT: addl %eax, %edx
+; AVX512VL-NEXT: addl %ecx, %edx
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq 24(%rdi), %rax
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: popcntq 16(%rdi), %rcx
+; AVX512VL-NEXT: popcntq 8(%rdi), %rsi
+; AVX512VL-NEXT: addl %eax, %ecx
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq (%rdi), %rax
+; AVX512VL-NEXT: addl %esi, %eax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: addl %edx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_ctpop_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: popcntq 56(%rdi), %rax
+; AVX512POPCNT-NEXT: popcntq 48(%rdi), %rcx
+; AVX512POPCNT-NEXT: addl %eax, %ecx
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq 40(%rdi), %rax
+; AVX512POPCNT-NEXT: popcntq 32(%rdi), %rdx
+; AVX512POPCNT-NEXT: addl %eax, %edx
+; AVX512POPCNT-NEXT: addl %ecx, %edx
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq 24(%rdi), %rax
+; AVX512POPCNT-NEXT: xorl %ecx, %ecx
+; AVX512POPCNT-NEXT: popcntq 16(%rdi), %rcx
+; AVX512POPCNT-NEXT: popcntq 8(%rdi), %rsi
+; AVX512POPCNT-NEXT: addl %eax, %ecx
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq (%rdi), %rax
+; AVX512POPCNT-NEXT: addl %esi, %eax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: addl %edx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
%a0 = load i512, ptr %p0
%cnt = call i512 @llvm.ctpop.i512(i512 %a0)
%res = trunc i512 %cnt to i32
ret i32 %res
}
+define i32 @vector_ctpop_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: movq %xmm1, %rdx
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: pextrq $1, %xmm2, %rdi
+; SSE-NEXT: movq %xmm2, %r8
+; SSE-NEXT: movq %xmm3, %r9
+; SSE-NEXT: pextrq $1, %xmm3, %r10
+; SSE-NEXT: popcntq %r10, %r10
+; SSE-NEXT: popcntq %r9, %r9
+; SSE-NEXT: addl %r10d, %r9d
+; SSE-NEXT: popcntq %rdi, %rdi
+; SSE-NEXT: popcntq %r8, %r8
+; SSE-NEXT: addl %edi, %r8d
+; SSE-NEXT: addl %r9d, %r8d
+; SSE-NEXT: popcntq %rsi, %rsi
+; SSE-NEXT: popcntq %rdx, %rdx
+; SSE-NEXT: addl %esi, %edx
+; SSE-NEXT: popcntq %rcx, %rcx
+; SSE-NEXT: popcntq %rax, %rax
+; SSE-NEXT: addl %ecx, %eax
+; SSE-NEXT: addl %edx, %eax
+; SSE-NEXT: addl %r8d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctpop_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm1, %rdi
+; AVX2-NEXT: vmovq %xmm1, %r8
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %r9
+; AVX2-NEXT: vmovq %xmm0, %r10
+; AVX2-NEXT: popcntq %r9, %r9
+; AVX2-NEXT: popcntq %r10, %r10
+; AVX2-NEXT: addl %r9d, %r10d
+; AVX2-NEXT: popcntq %rdi, %rdi
+; AVX2-NEXT: popcntq %r8, %r8
+; AVX2-NEXT: addl %edi, %r8d
+; AVX2-NEXT: addl %r10d, %r8d
+; AVX2-NEXT: popcntq %rsi, %rsi
+; AVX2-NEXT: popcntq %rdx, %rdx
+; AVX2-NEXT: addl %esi, %edx
+; AVX2-NEXT: popcntq %rcx, %rcx
+; AVX2-NEXT: popcntq %rax, %rax
+; AVX2-NEXT: addl %ecx, %eax
+; AVX2-NEXT: addl %edx, %eax
+; AVX2-NEXT: addl %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctpop_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rdi
+; AVX512F-NEXT: vmovq %xmm1, %r8
+; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512F-NEXT: vpextrq $1, %xmm0, %r9
+; AVX512F-NEXT: vmovq %xmm0, %r10
+; AVX512F-NEXT: popcntq %r9, %r9
+; AVX512F-NEXT: popcntq %r10, %r10
+; AVX512F-NEXT: addl %r9d, %r10d
+; AVX512F-NEXT: popcntq %rdi, %rdi
+; AVX512F-NEXT: popcntq %r8, %r8
+; AVX512F-NEXT: addl %edi, %r8d
+; AVX512F-NEXT: addl %r10d, %r8d
+; AVX512F-NEXT: popcntq %rdx, %rdx
+; AVX512F-NEXT: popcntq %rsi, %rsi
+; AVX512F-NEXT: addl %edx, %esi
+; AVX512F-NEXT: popcntq %rcx, %rcx
+; AVX512F-NEXT: popcntq %rax, %rax
+; AVX512F-NEXT: addl %ecx, %eax
+; AVX512F-NEXT: addl %esi, %eax
+; AVX512F-NEXT: addl %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctpop_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovq %xmm1, %rax
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT: vmovq %xmm0, %rsi
+; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512VL-NEXT: vmovq %xmm1, %rdi
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %r8
+; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512VL-NEXT: vmovq %xmm0, %r9
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %r10
+; AVX512VL-NEXT: popcntq %r10, %r10
+; AVX512VL-NEXT: popcntq %r9, %r9
+; AVX512VL-NEXT: addl %r10d, %r9d
+; AVX512VL-NEXT: popcntq %r8, %r8
+; AVX512VL-NEXT: popcntq %rdi, %rdi
+; AVX512VL-NEXT: addl %r8d, %edi
+; AVX512VL-NEXT: addl %r9d, %edi
+; AVX512VL-NEXT: popcntq %rdx, %rdx
+; AVX512VL-NEXT: popcntq %rsi, %rsi
+; AVX512VL-NEXT: addl %edx, %esi
+; AVX512VL-NEXT: popcntq %rcx, %rcx
+; AVX512VL-NEXT: popcntq %rax, %rax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: addl %esi, %eax
+; AVX512VL-NEXT: addl %edi, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512POPCNT-NEXT: vmovq %xmm1, %rax
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi
+; AVX512POPCNT-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512POPCNT-NEXT: vmovq %xmm1, %rdi
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %r8
+; AVX512POPCNT-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512POPCNT-NEXT: vmovq %xmm0, %r9
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %r10
+; AVX512POPCNT-NEXT: popcntq %r10, %r10
+; AVX512POPCNT-NEXT: popcntq %r9, %r9
+; AVX512POPCNT-NEXT: addl %r10d, %r9d
+; AVX512POPCNT-NEXT: popcntq %r8, %r8
+; AVX512POPCNT-NEXT: popcntq %rdi, %rdi
+; AVX512POPCNT-NEXT: addl %r8d, %edi
+; AVX512POPCNT-NEXT: addl %r9d, %edi
+; AVX512POPCNT-NEXT: popcntq %rdx, %rdx
+; AVX512POPCNT-NEXT: popcntq %rsi, %rsi
+; AVX512POPCNT-NEXT: addl %edx, %esi
+; AVX512POPCNT-NEXT: popcntq %rcx, %rcx
+; AVX512POPCNT-NEXT: popcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: addl %esi, %eax
+; AVX512POPCNT-NEXT: addl %edi, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <16 x i32> %v0 to i512
+ %cnt = call i512 @llvm.ctpop.i512(i512 %a0)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
; SSE-LABEL: test_ctpop_i1024:
; SSE: # %bb.0:
@@ -309,57 +825,149 @@ define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
; AVX2-NEXT: popq %r14
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_ctpop_i1024:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: addl %eax, %r10d
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: addl %eax, %r11d
-; AVX512-NEXT: addl %r10d, %r11d
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: xorl %ebx, %ebx
-; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %rbx
-; AVX512-NEXT: xorl %r14d, %r14d
-; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %r14
-; AVX512-NEXT: addl %eax, %ebx
-; AVX512-NEXT: xorl %r10d, %r10d
-; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: addl %r14d, %r10d
-; AVX512-NEXT: addl %ebx, %r10d
-; AVX512-NEXT: addl %r11d, %r10d
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: xorl %r11d, %r11d
-; AVX512-NEXT: popcntq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: addl %eax, %r11d
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq %r9, %rax
-; AVX512-NEXT: popcntq %r8, %r8
-; AVX512-NEXT: addl %eax, %r8d
-; AVX512-NEXT: addl %r11d, %r8d
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq %rcx, %rax
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: popcntq %rdx, %rcx
-; AVX512-NEXT: addl %eax, %ecx
-; AVX512-NEXT: xorl %edx, %edx
-; AVX512-NEXT: popcntq %rsi, %rdx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq %rdi, %rax
-; AVX512-NEXT: addl %edx, %eax
-; AVX512-NEXT: addl %ecx, %eax
-; AVX512-NEXT: addl %r8d, %eax
-; AVX512-NEXT: addl %r10d, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_ctpop_i1024:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: addl %eax, %r10d
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: addl %eax, %r11d
+; AVX512F-NEXT: addl %r10d, %r11d
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT: addl %eax, %ebx
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: addl %r14d, %r10d
+; AVX512F-NEXT: addl %ebx, %r10d
+; AVX512F-NEXT: addl %r11d, %r10d
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: popcntq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: addl %eax, %r11d
+; AVX512F-NEXT: popcntq %r9, %rax
+; AVX512F-NEXT: popcntq %r8, %r8
+; AVX512F-NEXT: addl %eax, %r8d
+; AVX512F-NEXT: addl %r11d, %r8d
+; AVX512F-NEXT: popcntq %rcx, %rax
+; AVX512F-NEXT: popcntq %rdx, %rcx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: popcntq %rsi, %rdx
+; AVX512F-NEXT: popcntq %rdi, %rax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: addl %ecx, %eax
+; AVX512F-NEXT: addl %r8d, %eax
+; AVX512F-NEXT: addl %r10d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_ctpop_i1024:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: addl %eax, %r10d
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: addl %eax, %r11d
+; AVX512VL-NEXT: addl %r10d, %r11d
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT: xorl %ebx, %ebx
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT: xorl %r14d, %r14d
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r14
+; AVX512VL-NEXT: addl %eax, %ebx
+; AVX512VL-NEXT: xorl %r10d, %r10d
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: addl %r14d, %r10d
+; AVX512VL-NEXT: addl %ebx, %r10d
+; AVX512VL-NEXT: addl %r11d, %r10d
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT: xorl %r11d, %r11d
+; AVX512VL-NEXT: popcntq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: addl %eax, %r11d
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq %r9, %rax
+; AVX512VL-NEXT: popcntq %r8, %r8
+; AVX512VL-NEXT: addl %eax, %r8d
+; AVX512VL-NEXT: addl %r11d, %r8d
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq %rcx, %rax
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: popcntq %rdx, %rcx
+; AVX512VL-NEXT: addl %eax, %ecx
+; AVX512VL-NEXT: xorl %edx, %edx
+; AVX512VL-NEXT: popcntq %rsi, %rdx
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq %rdi, %rax
+; AVX512VL-NEXT: addl %edx, %eax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: addl %r8d, %eax
+; AVX512VL-NEXT: addl %r10d, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_ctpop_i1024:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: pushq %r14
+; AVX512POPCNT-NEXT: pushq %rbx
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
+; AVX512POPCNT-NEXT: addl %eax, %r10d
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r11
+; AVX512POPCNT-NEXT: addl %eax, %r11d
+; AVX512POPCNT-NEXT: addl %r10d, %r11d
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512POPCNT-NEXT: xorl %ebx, %ebx
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rbx
+; AVX512POPCNT-NEXT: xorl %r14d, %r14d
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r14
+; AVX512POPCNT-NEXT: addl %eax, %ebx
+; AVX512POPCNT-NEXT: xorl %r10d, %r10d
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
+; AVX512POPCNT-NEXT: addl %r14d, %r10d
+; AVX512POPCNT-NEXT: addl %ebx, %r10d
+; AVX512POPCNT-NEXT: addl %r11d, %r10d
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
+; AVX512POPCNT-NEXT: xorl %r11d, %r11d
+; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r11
+; AVX512POPCNT-NEXT: addl %eax, %r11d
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq %r9, %rax
+; AVX512POPCNT-NEXT: popcntq %r8, %r8
+; AVX512POPCNT-NEXT: addl %eax, %r8d
+; AVX512POPCNT-NEXT: addl %r11d, %r8d
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq %rcx, %rax
+; AVX512POPCNT-NEXT: xorl %ecx, %ecx
+; AVX512POPCNT-NEXT: popcntq %rdx, %rcx
+; AVX512POPCNT-NEXT: addl %eax, %ecx
+; AVX512POPCNT-NEXT: xorl %edx, %edx
+; AVX512POPCNT-NEXT: popcntq %rsi, %rdx
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq %rdi, %rax
+; AVX512POPCNT-NEXT: addl %edx, %eax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: addl %r8d, %eax
+; AVX512POPCNT-NEXT: addl %r10d, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: popq %rbx
+; AVX512POPCNT-NEXT: popq %r14
+; AVX512POPCNT-NEXT: retq
%cnt = call i1024 @llvm.ctpop.i1024(i1024 %a0)
%res = trunc i1024 %cnt to i32
ret i32 %res
@@ -460,52 +1068,135 @@ define i32 @load_ctpop_i1024(ptr %p0) nounwind {
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: load_ctpop_i1024:
-; AVX512: # %bb.0:
-; AVX512-NEXT: popcntq 120(%rdi), %rax
-; AVX512-NEXT: popcntq 112(%rdi), %rcx
-; AVX512-NEXT: addl %eax, %ecx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq 104(%rdi), %rax
-; AVX512-NEXT: popcntq 96(%rdi), %rdx
-; AVX512-NEXT: addl %eax, %edx
-; AVX512-NEXT: addl %ecx, %edx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq 88(%rdi), %rax
-; AVX512-NEXT: popcntq 80(%rdi), %rsi
-; AVX512-NEXT: popcntq 72(%rdi), %r8
-; AVX512-NEXT: addl %eax, %esi
-; AVX512-NEXT: xorl %ecx, %ecx
-; AVX512-NEXT: popcntq 64(%rdi), %rcx
-; AVX512-NEXT: addl %r8d, %ecx
-; AVX512-NEXT: addl %esi, %ecx
-; AVX512-NEXT: addl %edx, %ecx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq 56(%rdi), %rax
-; AVX512-NEXT: xorl %edx, %edx
-; AVX512-NEXT: popcntq 48(%rdi), %rdx
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: popcntq 40(%rdi), %rsi
-; AVX512-NEXT: addl %eax, %edx
-; AVX512-NEXT: xorl %r8d, %r8d
-; AVX512-NEXT: popcntq 32(%rdi), %r8
-; AVX512-NEXT: addl %esi, %r8d
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq 24(%rdi), %rax
-; AVX512-NEXT: addl %edx, %r8d
-; AVX512-NEXT: xorl %edx, %edx
-; AVX512-NEXT: popcntq 16(%rdi), %rdx
-; AVX512-NEXT: addl %eax, %edx
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: popcntq 8(%rdi), %rsi
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: popcntq (%rdi), %rax
-; AVX512-NEXT: addl %esi, %eax
-; AVX512-NEXT: addl %edx, %eax
-; AVX512-NEXT: addl %r8d, %eax
-; AVX512-NEXT: addl %ecx, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_ctpop_i1024:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: popcntq 120(%rdi), %rax
+; AVX512F-NEXT: popcntq 112(%rdi), %rcx
+; AVX512F-NEXT: addl %eax, %ecx
+; AVX512F-NEXT: popcntq 104(%rdi), %rax
+; AVX512F-NEXT: popcntq 96(%rdi), %rdx
+; AVX512F-NEXT: addl %eax, %edx
+; AVX512F-NEXT: addl %ecx, %edx
+; AVX512F-NEXT: popcntq 88(%rdi), %rax
+; AVX512F-NEXT: popcntq 80(%rdi), %rsi
+; AVX512F-NEXT: popcntq 72(%rdi), %r8
+; AVX512F-NEXT: popcntq 64(%rdi), %rcx
+; AVX512F-NEXT: addl %eax, %esi
+; AVX512F-NEXT: addl %r8d, %ecx
+; AVX512F-NEXT: addl %esi, %ecx
+; AVX512F-NEXT: addl %edx, %ecx
+; AVX512F-NEXT: popcntq 56(%rdi), %rax
+; AVX512F-NEXT: popcntq 48(%rdi), %rdx
+; AVX512F-NEXT: popcntq 40(%rdi), %rsi
+; AVX512F-NEXT: popcntq 32(%rdi), %r8
+; AVX512F-NEXT: addl %eax, %edx
+; AVX512F-NEXT: addl %esi, %r8d
+; AVX512F-NEXT: popcntq 24(%rdi), %rax
+; AVX512F-NEXT: addl %edx, %r8d
+; AVX512F-NEXT: popcntq 16(%rdi), %rdx
+; AVX512F-NEXT: addl %eax, %edx
+; AVX512F-NEXT: popcntq 8(%rdi), %rsi
+; AVX512F-NEXT: popcntq (%rdi), %rax
+; AVX512F-NEXT: addl %esi, %eax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: addl %r8d, %eax
+; AVX512F-NEXT: addl %ecx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_ctpop_i1024:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: popcntq 120(%rdi), %rax
+; AVX512VL-NEXT: popcntq 112(%rdi), %rcx
+; AVX512VL-NEXT: addl %eax, %ecx
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq 104(%rdi), %rax
+; AVX512VL-NEXT: popcntq 96(%rdi), %rdx
+; AVX512VL-NEXT: addl %eax, %edx
+; AVX512VL-NEXT: addl %ecx, %edx
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq 88(%rdi), %rax
+; AVX512VL-NEXT: popcntq 80(%rdi), %rsi
+; AVX512VL-NEXT: popcntq 72(%rdi), %r8
+; AVX512VL-NEXT: addl %eax, %esi
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: popcntq 64(%rdi), %rcx
+; AVX512VL-NEXT: addl %r8d, %ecx
+; AVX512VL-NEXT: addl %esi, %ecx
+; AVX512VL-NEXT: addl %edx, %ecx
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq 56(%rdi), %rax
+; AVX512VL-NEXT: xorl %edx, %edx
+; AVX512VL-NEXT: popcntq 48(%rdi), %rdx
+; AVX512VL-NEXT: xorl %esi, %esi
+; AVX512VL-NEXT: popcntq 40(%rdi), %rsi
+; AVX512VL-NEXT: addl %eax, %edx
+; AVX512VL-NEXT: xorl %r8d, %r8d
+; AVX512VL-NEXT: popcntq 32(%rdi), %r8
+; AVX512VL-NEXT: addl %esi, %r8d
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq 24(%rdi), %rax
+; AVX512VL-NEXT: addl %edx, %r8d
+; AVX512VL-NEXT: xorl %edx, %edx
+; AVX512VL-NEXT: popcntq 16(%rdi), %rdx
+; AVX512VL-NEXT: addl %eax, %edx
+; AVX512VL-NEXT: xorl %esi, %esi
+; AVX512VL-NEXT: popcntq 8(%rdi), %rsi
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq (%rdi), %rax
+; AVX512VL-NEXT: addl %esi, %eax
+; AVX512VL-NEXT: addl %edx, %eax
+; AVX512VL-NEXT: addl %r8d, %eax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_ctpop_i1024:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: popcntq 120(%rdi), %rax
+; AVX512POPCNT-NEXT: popcntq 112(%rdi), %rcx
+; AVX512POPCNT-NEXT: addl %eax, %ecx
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq 104(%rdi), %rax
+; AVX512POPCNT-NEXT: popcntq 96(%rdi), %rdx
+; AVX512POPCNT-NEXT: addl %eax, %edx
+; AVX512POPCNT-NEXT: addl %ecx, %edx
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq 88(%rdi), %rax
+; AVX512POPCNT-NEXT: popcntq 80(%rdi), %rsi
+; AVX512POPCNT-NEXT: popcntq 72(%rdi), %r8
+; AVX512POPCNT-NEXT: addl %eax, %esi
+; AVX512POPCNT-NEXT: xorl %ecx, %ecx
+; AVX512POPCNT-NEXT: popcntq 64(%rdi), %rcx
+; AVX512POPCNT-NEXT: addl %r8d, %ecx
+; AVX512POPCNT-NEXT: addl %esi, %ecx
+; AVX512POPCNT-NEXT: addl %edx, %ecx
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq 56(%rdi), %rax
+; AVX512POPCNT-NEXT: xorl %edx, %edx
+; AVX512POPCNT-NEXT: popcntq 48(%rdi), %rdx
+; AVX512POPCNT-NEXT: xorl %esi, %esi
+; AVX512POPCNT-NEXT: popcntq 40(%rdi), %rsi
+; AVX512POPCNT-NEXT: addl %eax, %edx
+; AVX512POPCNT-NEXT: xorl %r8d, %r8d
+; AVX512POPCNT-NEXT: popcntq 32(%rdi), %r8
+; AVX512POPCNT-NEXT: addl %esi, %r8d
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq 24(%rdi), %rax
+; AVX512POPCNT-NEXT: addl %edx, %r8d
+; AVX512POPCNT-NEXT: xorl %edx, %edx
+; AVX512POPCNT-NEXT: popcntq 16(%rdi), %rdx
+; AVX512POPCNT-NEXT: addl %eax, %edx
+; AVX512POPCNT-NEXT: xorl %esi, %esi
+; AVX512POPCNT-NEXT: popcntq 8(%rdi), %rsi
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq (%rdi), %rax
+; AVX512POPCNT-NEXT: addl %esi, %eax
+; AVX512POPCNT-NEXT: addl %edx, %eax
+; AVX512POPCNT-NEXT: addl %r8d, %eax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
%a0 = load i1024, ptr %p0
%cnt = call i1024 @llvm.ctpop.i1024(i1024 %a0)
%res = trunc i1024 %cnt to i32
@@ -596,6 +1287,75 @@ define i32 @load_ctlz_i128(ptr %p0) nounwind {
ret i32 %res
}
+define i32 @vector_ctlz_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: pextrq $1, %xmm0, %rdx
+; SSE-NEXT: bsrq %rdx, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: movl $127, %eax
+; SSE-NEXT: bsrq %rcx, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: lzcntq %rcx, %rdx
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_i128:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT: lzcntq %rcx, %rdx
+; AVX512F-NEXT: lzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_i128:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: lzcntq %rcx, %rdx
+; AVX512VL-NEXT: lzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_i128:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx
+; AVX512POPCNT-NEXT: lzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128
+ %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 0)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctlz_i256(i256 %a0) nounwind {
; SSE-LABEL: test_ctlz_i256:
; SSE: # %bb.0:
@@ -710,32 +1470,177 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind {
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: load_ctlz_i256:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movq 8(%rdi), %rcx
-; AVX512-NEXT: movq 16(%rdi), %rdx
-; AVX512-NEXT: movq 24(%rdi), %rsi
-; AVX512-NEXT: lzcntq %rsi, %rax
-; AVX512-NEXT: lzcntq %rdx, %r8
-; AVX512-NEXT: addl $64, %r8d
-; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: cmovnel %eax, %r8d
-; AVX512-NEXT: lzcntq %rcx, %r9
-; AVX512-NEXT: lzcntq (%rdi), %rax
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: cmovnel %r9d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %rsi, %rdx
-; AVX512-NEXT: cmovnel %r8d, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_ctlz_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = mem[3,2,1,0]
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm0 {%k1}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_ctlz_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX512VL-NEXT: vplzcntq %ymm0, %ymm1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512VL-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_ctlz_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX512POPCNT-NEXT: vplzcntq %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%a0 = load i256, ptr %p0
%cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0)
%res = trunc i256 %cnt to i32
ret i32 %res
}
+define i32 @vector_ctlz_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: pextrq $1, %xmm0, %rdx
+; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: bsrq %rsi, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: bsrq %rax, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %r8d
+; SSE-NEXT: bsrq %rdx, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: movl $127, %eax
+; SSE-NEXT: bsrq %rcx, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: lzcntq %rsi, %rdi
+; AVX2-NEXT: lzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: lzcntq %rcx, %rdi
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, %rdx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512F-NEXT: lzcntq %rsi, %rdi
+; AVX512F-NEXT: lzcntq %rdx, %r8
+; AVX512F-NEXT: addl $64, %r8d
+; AVX512F-NEXT: testq %rsi, %rsi
+; AVX512F-NEXT: cmovnel %edi, %r8d
+; AVX512F-NEXT: lzcntq %rcx, %rdi
+; AVX512F-NEXT: lzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edi, %eax
+; AVX512F-NEXT: subl $-128, %eax
+; AVX512F-NEXT: orq %rsi, %rdx
+; AVX512F-NEXT: cmovnel %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq %xmm0, %rdx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT: lzcntq %rsi, %rdi
+; AVX512VL-NEXT: lzcntq %rdx, %r8
+; AVX512VL-NEXT: addl $64, %r8d
+; AVX512VL-NEXT: testq %rsi, %rsi
+; AVX512VL-NEXT: cmovnel %edi, %r8d
+; AVX512VL-NEXT: lzcntq %rcx, %rdi
+; AVX512VL-NEXT: lzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edi, %eax
+; AVX512VL-NEXT: subl $-128, %eax
+; AVX512VL-NEXT: orq %rsi, %rdx
+; AVX512VL-NEXT: cmovnel %r8d, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT: lzcntq %rdx, %r8
+; AVX512POPCNT-NEXT: addl $64, %r8d
+; AVX512POPCNT-NEXT: testq %rsi, %rsi
+; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT: lzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edi, %eax
+; AVX512POPCNT-NEXT: subl $-128, %eax
+; AVX512POPCNT-NEXT: orq %rsi, %rdx
+; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <8 x i32> %v0 to i256
+ %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_ctlz_i512(i512 %a0) nounwind {
; SSE-LABEL: test_ctlz_i512:
; SSE: # %bb.0:
@@ -843,50 +1748,76 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind {
; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_ctlz_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: lzcntq %r11, %rax
-; AVX512-NEXT: lzcntq %r10, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %r11, %r11
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: lzcntq %r9, %rax
-; AVX512-NEXT: lzcntq %r8, %rbx
-; AVX512-NEXT: addl $64, %ebx
-; AVX512-NEXT: testq %r9, %r9
-; AVX512-NEXT: cmovnel %eax, %ebx
-; AVX512-NEXT: subl $-128, %ebx
-; AVX512-NEXT: movq %r10, %rax
-; AVX512-NEXT: orq %r11, %rax
-; AVX512-NEXT: cmovnel %r14d, %ebx
-; AVX512-NEXT: lzcntq %rcx, %rax
-; AVX512-NEXT: lzcntq %rdx, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: lzcntq %rsi, %r15
-; AVX512-NEXT: lzcntq %rdi, %rax
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: cmovnel %r15d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %rcx, %rdx
-; AVX512-NEXT: cmovnel %r14d, %eax
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %r11, %r9
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: orq %r9, %r8
-; AVX512-NEXT: cmovnel %ebx, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_ctlz_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %rdi, %xmm0
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vmovq %rcx, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vmovq %r9, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_ctlz_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovq %rdi, %xmm0
+; AVX512VL-NEXT: vmovq %rsi, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vmovq %rcx, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vmovq %r9, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_ctlz_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0
+; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1
+; AVX512POPCNT-NEXT: vmovq %rcx, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512POPCNT-NEXT: vmovq %r8, %xmm2
+; AVX512POPCNT-NEXT: vmovq %r9, %xmm3
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0)
%res = trunc i512 %cnt to i32
ret i32 %res
@@ -1008,59 +1939,194 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind {
; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
-; AVX512-LABEL: load_ctlz_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq 8(%rdi), %r11
-; AVX512-NEXT: movq 16(%rdi), %r9
-; AVX512-NEXT: movq 24(%rdi), %r10
-; AVX512-NEXT: movq 32(%rdi), %rcx
-; AVX512-NEXT: movq 40(%rdi), %rdx
-; AVX512-NEXT: movq 48(%rdi), %rsi
-; AVX512-NEXT: movq 56(%rdi), %r8
-; AVX512-NEXT: lzcntq %r8, %rax
-; AVX512-NEXT: lzcntq %rsi, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %r8, %r8
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: lzcntq %rdx, %rax
-; AVX512-NEXT: lzcntq %rcx, %rbx
-; AVX512-NEXT: addl $64, %ebx
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %eax, %ebx
-; AVX512-NEXT: subl $-128, %ebx
-; AVX512-NEXT: movq %rsi, %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: cmovnel %r14d, %ebx
-; AVX512-NEXT: lzcntq %r10, %rax
-; AVX512-NEXT: lzcntq %r9, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %r10, %r10
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: lzcntq (%rdi), %rax
-; AVX512-NEXT: lzcntq %r11, %rdi
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %r11, %r11
-; AVX512-NEXT: cmovnel %edi, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %r10, %r9
-; AVX512-NEXT: cmovnel %r14d, %eax
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %r8, %rdx
-; AVX512-NEXT: orq %rsi, %rcx
-; AVX512-NEXT: orq %rdx, %rcx
-; AVX512-NEXT: cmovnel %ebx, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_ctlz_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_ctlz_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_ctlz_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%a0 = load i512, ptr %p0
%cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0)
%res = trunc i512 %cnt to i32
ret i32 %res
}
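+; The vector test below bitcasts a <16 x i32> argument to i512, so the input arrives in
+; XMM/YMM/ZMM registers rather than GPRs or stack slots.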
+define i32 @vector_ctlz_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rdx
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: pextrq $1, %xmm1, %rax
+; SSE-NEXT: pextrq $1, %xmm2, %rdi
+; SSE-NEXT: movq %xmm2, %rsi
+; SSE-NEXT: movq %xmm3, %r8
+; SSE-NEXT: pextrq $1, %xmm3, %r9
+; SSE-NEXT: bsrq %r9, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: bsrq %r8, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %r10d, %r8d
+; SSE-NEXT: bsrq %rdi, %r9
+; SSE-NEXT: xorl $63, %r9d
+; SSE-NEXT: bsrq %rsi, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: orl $64, %esi
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %r9d, %esi
+; SSE-NEXT: movq %xmm1, %rdi
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: ptest %xmm3, %xmm3
+; SSE-NEXT: cmovnel %r8d, %esi
+; SSE-NEXT: bsrq %rax, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: bsrq %rdi, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: orl $64, %edi
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %r8d, %edi
+; SSE-NEXT: bsrq %rcx, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: movl $127, %eax
+; SSE-NEXT: bsrq %rdx, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT: vmovq %xmm2, %rdx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: vmovq %xmm2, %r8
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: vmovq %xmm1, %rdi
+; AVX2-NEXT: vpextrq $1, %xmm1, %r9
+; AVX2-NEXT: lzcntq %rax, %r10
+; AVX2-NEXT: lzcntq %r8, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: cmovnel %r10d, %r11d
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: lzcntq %r9, %r10
+; AVX2-NEXT: lzcntq %rdi, %rdi
+; AVX2-NEXT: addl $64, %edi
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %edi
+; AVX2-NEXT: subl $-128, %edi
+; AVX2-NEXT: orq %rax, %r8
+; AVX2-NEXT: cmovnel %r11d, %edi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rcx, %rax
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: lzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: lzcntq %rsi, %r9
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: vptest %ymm1, %ymm1
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <16 x i32> %v0 to i512
+ %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
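+; ctlz of an i1024 splits into 512-bit halves: the count is ctlz(hi) when the high half
+; is non-zero, and 512 + ctlz(lo) otherwise.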
define i32 @test_ctlz_i1024(i1024 %a0) nounwind {
; SSE-LABEL: test_ctlz_i1024:
; SSE: # %bb.0:
@@ -1312,116 +2378,151 @@ define i32 @test_ctlz_i1024(i1024 %a0) nounwind {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_ctlz_i1024:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq %r9, %r14
-; AVX512-NEXT: movq %r8, %r11
-; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512-NEXT: lzcntq %r12, %rcx
-; AVX512-NEXT: lzcntq %r8, %r9
-; AVX512-NEXT: addl $64, %r9d
-; AVX512-NEXT: testq %r12, %r12
-; AVX512-NEXT: cmovnel %ecx, %r9d
-; AVX512-NEXT: lzcntq %r10, %rsi
-; AVX512-NEXT: lzcntq %rax, %rcx
-; AVX512-NEXT: addl $64, %ecx
-; AVX512-NEXT: testq %r10, %r10
-; AVX512-NEXT: cmovnel %esi, %ecx
-; AVX512-NEXT: subl $-128, %ecx
-; AVX512-NEXT: movq %r8, %rsi
-; AVX512-NEXT: orq %r12, %rsi
-; AVX512-NEXT: cmovnel %r9d, %ecx
-; AVX512-NEXT: lzcntq %rbx, %rdi
-; AVX512-NEXT: lzcntq %r15, %rsi
-; AVX512-NEXT: addl $64, %esi
-; AVX512-NEXT: testq %rbx, %rbx
-; AVX512-NEXT: cmovnel %edi, %esi
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX512-NEXT: lzcntq %r13, %rbp
-; AVX512-NEXT: addl $64, %ebp
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; AVX512-NEXT: lzcntq %r9, %rdi
-; AVX512-NEXT: testq %r9, %r9
-; AVX512-NEXT: cmovnel %edi, %ebp
-; AVX512-NEXT: subl $-128, %ebp
-; AVX512-NEXT: movq %r15, %rdi
-; AVX512-NEXT: orq %rbx, %rdi
-; AVX512-NEXT: cmovnel %esi, %ebp
-; AVX512-NEXT: addl $256, %ebp # imm = 0x100
-; AVX512-NEXT: movq %r10, %rdi
-; AVX512-NEXT: orq %r12, %rdi
-; AVX512-NEXT: movq %rax, %rsi
-; AVX512-NEXT: orq %r8, %rsi
-; AVX512-NEXT: orq %rdi, %rsi
-; AVX512-NEXT: cmovnel %ecx, %ebp
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: lzcntq %rdi, %rax
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512-NEXT: lzcntq %r12, %rcx
-; AVX512-NEXT: testq %r12, %r12
-; AVX512-NEXT: cmovnel %ecx, %eax
-; AVX512-NEXT: lzcntq %r11, %rcx
-; AVX512-NEXT: addl $64, %ecx
-; AVX512-NEXT: lzcntq %r14, %rsi
-; AVX512-NEXT: testq %r14, %r14
-; AVX512-NEXT: cmovnel %esi, %ecx
-; AVX512-NEXT: subl $-128, %ecx
-; AVX512-NEXT: movq %rdi, %rsi
-; AVX512-NEXT: orq %r12, %rsi
-; AVX512-NEXT: cmovnel %eax, %ecx
-; AVX512-NEXT: movq %rdx, %rdi
-; AVX512-NEXT: lzcntq %rdx, %rdx
-; AVX512-NEXT: addl $64, %edx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: lzcntq %r10, %rax
-; AVX512-NEXT: testq %r10, %r10
-; AVX512-NEXT: cmovnel %eax, %edx
-; AVX512-NEXT: lzcntq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: lzcntq %rsi, %r8
-; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: cmovnel %r8d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %r10, %rdi
-; AVX512-NEXT: cmovnel %edx, %eax
-; AVX512-NEXT: orq %r12, %r14
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %r14, %r11
-; AVX512-NEXT: cmovnel %ecx, %eax
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r9
-; AVX512-NEXT: orq %rbx, %r9
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r15
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13
-; AVX512-NEXT: orq %r15, %r13
-; AVX512-NEXT: addl $512, %eax # imm = 0x200
-; AVX512-NEXT: orq %r9, %r13
-; AVX512-NEXT: cmovnel %ebp, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_ctlz_i1024:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT: vmovq %rdi, %xmm0
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vmovq %rcx, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vmovq %r8, %xmm1
+; AVX512F-NEXT: vmovq %r9, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vmovd %xmm0, %ecx
+; AVX512F-NEXT: addl $512, %ecx # imm = 0x200
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: orq %r14, %r11
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: orq %rbx, %r10
+; AVX512F-NEXT: orq %r11, %r10
+; AVX512F-NEXT: cmovel %ecx, %eax
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_ctlz_i1024:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512VL-NEXT: vmovq %rdi, %xmm0
+; AVX512VL-NEXT: vmovq %rsi, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vmovq %rcx, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vmovq %r9, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %ecx
+; AVX512VL-NEXT: addl $512, %ecx # imm = 0x200
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r14
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT: orq %r14, %r11
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: orq %rbx, %r10
+; AVX512VL-NEXT: orq %r11, %r10
+; AVX512VL-NEXT: cmovel %ecx, %eax
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_ctlz_i1024:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: pushq %r14
+; AVX512POPCNT-NEXT: pushq %rbx
+; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0
+; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1
+; AVX512POPCNT-NEXT: vmovq %rcx, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512POPCNT-NEXT: vmovq %r8, %xmm2
+; AVX512POPCNT-NEXT: vmovq %r9, %xmm3
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %ecx
+; AVX512POPCNT-NEXT: addl $512, %ecx # imm = 0x200
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r14
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r11
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rbx
+; AVX512POPCNT-NEXT: orq %r14, %r11
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; AVX512POPCNT-NEXT: orq %rbx, %r10
+; AVX512POPCNT-NEXT: orq %r11, %r10
+; AVX512POPCNT-NEXT: cmovel %ecx, %eax
+; AVX512POPCNT-NEXT: popq %rbx
+; AVX512POPCNT-NEXT: popq %r14
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 0)
%res = trunc i1024 %cnt to i32
ret i32 %res
@@ -1687,121 +2788,1768 @@ define i32 @load_ctlz_i1024(ptr %p0) nounwind {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512-LABEL: load_ctlz_i1024:
+; AVX512F-LABEL: load_ctlz_i1024:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq 80(%rdi), %rsi
+; AVX512F-NEXT: movq 64(%rdi), %rcx
+; AVX512F-NEXT: movq 72(%rdi), %rdx
+; AVX512F-NEXT: movq 88(%rdi), %r8
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm1, %r9d
+; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512F-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: orq 120(%rdi), %r8
+; AVX512F-NEXT: addl $512, %eax # imm = 0x200
+; AVX512F-NEXT: orq 104(%rdi), %rdx
+; AVX512F-NEXT: orq %r8, %rdx
+; AVX512F-NEXT: orq 112(%rdi), %rsi
+; AVX512F-NEXT: orq 96(%rdi), %rcx
+; AVX512F-NEXT: orq %rsi, %rcx
+; AVX512F-NEXT: orq %rdx, %rcx
+; AVX512F-NEXT: cmovnel %r9d, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_ctlz_i1024:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movq 80(%rdi), %rsi
+; AVX512VL-NEXT: movq 64(%rdi), %rcx
+; AVX512VL-NEXT: movq 72(%rdi), %rdx
+; AVX512VL-NEXT: movq 88(%rdi), %r8
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm1, %r9d
+; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: addl $512, %eax # imm = 0x200
+; AVX512VL-NEXT: orq 120(%rdi), %r8
+; AVX512VL-NEXT: orq 104(%rdi), %rdx
+; AVX512VL-NEXT: orq 112(%rdi), %rsi
+; AVX512VL-NEXT: orq %r8, %rdx
+; AVX512VL-NEXT: orq 96(%rdi), %rcx
+; AVX512VL-NEXT: orq %rsi, %rcx
+; AVX512VL-NEXT: orq %rdx, %rcx
+; AVX512VL-NEXT: cmovnel %r9d, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_ctlz_i1024:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: movq 80(%rdi), %rsi
+; AVX512POPCNT-NEXT: movq 64(%rdi), %rcx
+; AVX512POPCNT-NEXT: movq 72(%rdi), %rdx
+; AVX512POPCNT-NEXT: movq 88(%rdi), %r8
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %r9d
+; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200
+; AVX512POPCNT-NEXT: orq 120(%rdi), %r8
+; AVX512POPCNT-NEXT: orq 104(%rdi), %rdx
+; AVX512POPCNT-NEXT: orq 112(%rdi), %rsi
+; AVX512POPCNT-NEXT: orq %r8, %rdx
+; AVX512POPCNT-NEXT: orq 96(%rdi), %rcx
+; AVX512POPCNT-NEXT: orq %rsi, %rcx
+; AVX512POPCNT-NEXT: orq %rdx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %r9d, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = load i1024, ptr %p0
+ %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 0)
+ %res = trunc i1024 %cnt to i32
+ ret i32 %res
+}
+
+;
+; CTLZ_ZERO_UNDEF
+;
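+; The *_undef tests below pass i1 -1 (is-zero-poison) as the second ctlz operand, so
+; the result does not need to be defined when the input is zero.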
+
+define i32 @test_ctlz_undef_i128(i128 %a0) nounwind {
+; SSE-LABEL: test_ctlz_undef_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: bsrq %rsi, %rcx
+; SSE-NEXT: xorl $63, %ecx
+; SSE-NEXT: bsrq %rdi, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ctlz_undef_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: lzcntq %rsi, %rcx
+; AVX2-NEXT: lzcntq %rdi, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %ecx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_ctlz_undef_i128:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq 32(%rdi), %r14
-; AVX512-NEXT: movq 48(%rdi), %rbp
-; AVX512-NEXT: movq 64(%rdi), %r11
-; AVX512-NEXT: movq 72(%rdi), %r10
-; AVX512-NEXT: movq 80(%rdi), %rdx
-; AVX512-NEXT: movq 88(%rdi), %rbx
-; AVX512-NEXT: movq 96(%rdi), %rsi
-; AVX512-NEXT: movq 104(%rdi), %r9
-; AVX512-NEXT: movq 112(%rdi), %r8
-; AVX512-NEXT: movq 120(%rdi), %r15
-; AVX512-NEXT: lzcntq %r15, %rax
-; AVX512-NEXT: lzcntq %r8, %rcx
-; AVX512-NEXT: addl $64, %ecx
-; AVX512-NEXT: testq %r15, %r15
-; AVX512-NEXT: cmovnel %eax, %ecx
-; AVX512-NEXT: lzcntq %r9, %r12
-; AVX512-NEXT: lzcntq %rsi, %rax
-; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %r9, %r9
-; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: cmovnel %r12d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: movq %r8, %r12
-; AVX512-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: orq %r15, %r12
-; AVX512-NEXT: cmovnel %ecx, %eax
-; AVX512-NEXT: lzcntq %rbx, %rcx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: lzcntq %rdx, %r13
-; AVX512-NEXT: addl $64, %r13d
-; AVX512-NEXT: testq %rbx, %rbx
-; AVX512-NEXT: cmovnel %ecx, %r13d
-; AVX512-NEXT: lzcntq %r10, %rcx
-; AVX512-NEXT: lzcntq %r11, %r12
-; AVX512-NEXT: addl $64, %r12d
-; AVX512-NEXT: testq %r10, %r10
-; AVX512-NEXT: cmovnel %ecx, %r12d
-; AVX512-NEXT: subl $-128, %r12d
-; AVX512-NEXT: movq %rdx, %rcx
-; AVX512-NEXT: orq %rbx, %rcx
-; AVX512-NEXT: cmovnel %r13d, %r12d
-; AVX512-NEXT: addl $256, %r12d # imm = 0x100
-; AVX512-NEXT: movq %r9, %rcx
-; AVX512-NEXT: orq %r15, %rcx
-; AVX512-NEXT: orq %r8, %rsi
-; AVX512-NEXT: orq %rcx, %rsi
-; AVX512-NEXT: movq 56(%rdi), %r13
-; AVX512-NEXT: cmovnel %eax, %r12d
-; AVX512-NEXT: lzcntq %r13, %rcx
-; AVX512-NEXT: movq %rbp, %rsi
-; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: lzcntq %rbp, %rax
+; AVX512-NEXT: lzcntq %rsi, %rcx
+; AVX512-NEXT: lzcntq %rdi, %rax
; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %r13, %r13
+; AVX512-NEXT: testq %rsi, %rsi
; AVX512-NEXT: cmovnel %ecx, %eax
-; AVX512-NEXT: lzcntq %r14, %rbp
-; AVX512-NEXT: addl $64, %ebp
-; AVX512-NEXT: movq 40(%rdi), %r8
-; AVX512-NEXT: lzcntq %r8, %rdx
-; AVX512-NEXT: testq %r8, %r8
-; AVX512-NEXT: cmovnel %edx, %ebp
-; AVX512-NEXT: subl $-128, %ebp
-; AVX512-NEXT: movq %rsi, %rdx
-; AVX512-NEXT: orq %r13, %rdx
-; AVX512-NEXT: cmovnel %eax, %ebp
-; AVX512-NEXT: movq 16(%rdi), %r9
-; AVX512-NEXT: lzcntq %r9, %rcx
-; AVX512-NEXT: addl $64, %ecx
-; AVX512-NEXT: movq 24(%rdi), %rdx
-; AVX512-NEXT: lzcntq %rdx, %rax
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %eax, %ecx
-; AVX512-NEXT: movq 8(%rdi), %rsi
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
+ %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @load_ctlz_undef_i128(ptr %p0) nounwind {
+; SSE-LABEL: load_ctlz_undef_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: bsrq %rcx, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: bsrq (%rdi), %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: load_ctlz_undef_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: lzcntq %rcx, %rdx
+; AVX2-NEXT: lzcntq (%rdi), %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_ctlz_undef_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movq 8(%rdi), %rcx
+; AVX512-NEXT: lzcntq %rcx, %rdx
; AVX512-NEXT: lzcntq (%rdi), %rax
; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: lzcntq %rsi, %rdi
+; AVX512-NEXT: testq %rcx, %rcx
+; AVX512-NEXT: cmovnel %edx, %eax
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
+ %a0 = load i128, ptr %p0
+ %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_undef_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: bsrq %rcx, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: bsrq %rax, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_undef_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: lzcntq %rcx, %rdx
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_undef_i128:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT: lzcntq %rcx, %rdx
+; AVX512F-NEXT: lzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_undef_i128:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: lzcntq %rcx, %rdx
+; AVX512VL-NEXT: lzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_undef_i128:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx
+; AVX512POPCNT-NEXT: lzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128
+ %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @test_ctlz_undef_i256(i256 %a0) nounwind {
+; SSE-LABEL: test_ctlz_undef_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: bsrq %rcx, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %rdx, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %eax, %r8d
+; SSE-NEXT: bsrq %rsi, %r9
+; SSE-NEXT: xorl $63, %r9d
+; SSE-NEXT: bsrq %rdi, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %r9d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ctlz_undef_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: lzcntq %rcx, %rax
+; AVX2-NEXT: lzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: lzcntq %rsi, %r9
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rdi, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_ctlz_undef_i256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: lzcntq %rcx, %rax
+; AVX512-NEXT: lzcntq %rdx, %r8
+; AVX512-NEXT: addl $64, %r8d
+; AVX512-NEXT: testq %rcx, %rcx
+; AVX512-NEXT: cmovnel %eax, %r8d
+; AVX512-NEXT: lzcntq %rsi, %r9
+; AVX512-NEXT: lzcntq %rdi, %rax
+; AVX512-NEXT: addl $64, %eax
; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: cmovnel %edi, %eax
+; AVX512-NEXT: cmovnel %r9d, %eax
; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %rdx, %r9
-; AVX512-NEXT: cmovnel %ecx, %eax
-; AVX512-NEXT: orq %r13, %r8
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %r8, %r14
-; AVX512-NEXT: cmovnel %ebp, %eax
-; AVX512-NEXT: orq %r15, %rbx
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; AVX512-NEXT: orq %rbx, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX512-NEXT: orq %rcx, %r11
-; AVX512-NEXT: addl $512, %eax # imm = 0x200
-; AVX512-NEXT: orq %r10, %r11
-; AVX512-NEXT: cmovnel %r12d, %eax
+; AVX512-NEXT: orq %rcx, %rdx
+; AVX512-NEXT: cmovnel %r8d, %eax
; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
+ %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @load_ctlz_undef_i256(ptr %p0) nounwind {
+; SSE-LABEL: load_ctlz_undef_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq 16(%rdi), %rcx
+; SSE-NEXT: movq 24(%rdi), %rsi
+; SSE-NEXT: bsrq %rsi, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %rcx, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %eax, %r8d
+; SSE-NEXT: bsrq %rdx, %r9
+; SSE-NEXT: xorl $63, %r9d
+; SSE-NEXT: bsrq (%rdi), %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %r9d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %rsi, %rcx
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: load_ctlz_undef_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq 16(%rdi), %rcx
+; AVX2-NEXT: movq 24(%rdi), %rdx
+; AVX2-NEXT: lzcntq %rdx, %rax
+; AVX2-NEXT: lzcntq %rcx, %rsi
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %eax, %esi
+; AVX2-NEXT: movq 8(%rdi), %r8
+; AVX2-NEXT: lzcntq %r8, %r9
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq (%rdi), %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: load_ctlz_undef_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_ctlz_undef_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vplzcntq %ymm0, %ymm0
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vpcompressq %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_ctlz_undef_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpermq {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT: vplzcntq %ymm0, %ymm0
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512POPCNT-NEXT: vpcompressq %ymm0, %ymm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = load i256, ptr %p0
+ %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_undef_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: movq %xmm1, %rdx
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: bsrq %rsi, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: bsrq %rdx, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: orl $64, %edx
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %edx
+; SSE-NEXT: bsrq %rcx, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: bsrq %rax, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_undef_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: lzcntq %rsi, %rdi
+; AVX2-NEXT: lzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: lzcntq %rcx, %rdi
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_undef_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vmovq %xmm0, %rdx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512F-NEXT: lzcntq %rsi, %rdi
+; AVX512F-NEXT: lzcntq %rdx, %r8
+; AVX512F-NEXT: addl $64, %r8d
+; AVX512F-NEXT: testq %rsi, %rsi
+; AVX512F-NEXT: cmovnel %edi, %r8d
+; AVX512F-NEXT: lzcntq %rcx, %rdi
+; AVX512F-NEXT: lzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edi, %eax
+; AVX512F-NEXT: subl $-128, %eax
+; AVX512F-NEXT: orq %rsi, %rdx
+; AVX512F-NEXT: cmovnel %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_undef_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq %xmm0, %rdx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT: lzcntq %rsi, %rdi
+; AVX512VL-NEXT: lzcntq %rdx, %r8
+; AVX512VL-NEXT: addl $64, %r8d
+; AVX512VL-NEXT: testq %rsi, %rsi
+; AVX512VL-NEXT: cmovnel %edi, %r8d
+; AVX512VL-NEXT: lzcntq %rcx, %rdi
+; AVX512VL-NEXT: lzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edi, %eax
+; AVX512VL-NEXT: subl $-128, %eax
+; AVX512VL-NEXT: orq %rsi, %rdx
+; AVX512VL-NEXT: cmovnel %r8d, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_undef_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT: lzcntq %rdx, %r8
+; AVX512POPCNT-NEXT: addl $64, %r8d
+; AVX512POPCNT-NEXT: testq %rsi, %rsi
+; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT: lzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edi, %eax
+; AVX512POPCNT-NEXT: subl $-128, %eax
+; AVX512POPCNT-NEXT: orq %rsi, %rdx
+; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <8 x i32> %v0 to i256
+ %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @test_ctlz_undef_i512(i512 %a0) nounwind {
+; SSE-LABEL: test_ctlz_undef_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT: bsrq %r11, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %r10, %r14
+; SSE-NEXT: xorl $63, %r14d
+; SSE-NEXT: orl $64, %r14d
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovnel %eax, %r14d
+; SSE-NEXT: bsrq %r9, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %r8, %rbx
+; SSE-NEXT: xorl $63, %ebx
+; SSE-NEXT: orl $64, %ebx
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %eax, %ebx
+; SSE-NEXT: subl $-128, %ebx
+; SSE-NEXT: movq %r10, %rax
+; SSE-NEXT: orq %r11, %rax
+; SSE-NEXT: cmovnel %r14d, %ebx
+; SSE-NEXT: bsrq %rcx, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %rdx, %r14
+; SSE-NEXT: xorl $63, %r14d
+; SSE-NEXT: orl $64, %r14d
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %eax, %r14d
+; SSE-NEXT: bsrq %rsi, %r15
+; SSE-NEXT: xorl $63, %r15d
+; SSE-NEXT: bsrq %rdi, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: cmovnel %r14d, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: orq %r11, %r9
+; SSE-NEXT: orq %r10, %r8
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: cmovnel %ebx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ctlz_undef_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: lzcntq %r11, %rax
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: lzcntq %r10, %r14
+; AVX2-NEXT: addl $64, %r14d
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovnel %eax, %r14d
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %r9, %rax
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: lzcntq %r8, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: subl $-128, %ebx
+; AVX2-NEXT: movq %r10, %rax
+; AVX2-NEXT: orq %r11, %rax
+; AVX2-NEXT: cmovnel %r14d, %ebx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rcx, %rax
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: lzcntq %rdx, %r14
+; AVX2-NEXT: addl $64, %r14d
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %eax, %r14d
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: lzcntq %rsi, %r15
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rdi, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %r15d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r14d, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: orq %r11, %r9
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %ebx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_ctlz_undef_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %rdi, %xmm0
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vmovq %rcx, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vmovq %r8, %xmm1
+; AVX512F-NEXT: vmovq %r9, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_ctlz_undef_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovq %rdi, %xmm0
+; AVX512VL-NEXT: vmovq %rsi, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vmovq %rcx, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vmovq %r9, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_ctlz_undef_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0
+; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1
+; AVX512POPCNT-NEXT: vmovq %rcx, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512POPCNT-NEXT: vmovq %r8, %xmm2
+; AVX512POPCNT-NEXT: vmovq %r9, %xmm3
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @load_ctlz_undef_i512(ptr %p0) nounwind {
+; SSE-LABEL: load_ctlz_undef_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq 8(%rdi), %r11
+; SSE-NEXT: movq 16(%rdi), %r9
+; SSE-NEXT: movq 24(%rdi), %r10
+; SSE-NEXT: movq 32(%rdi), %rcx
+; SSE-NEXT: movq 40(%rdi), %rdx
+; SSE-NEXT: movq 48(%rdi), %rsi
+; SSE-NEXT: movq 56(%rdi), %r8
+; SSE-NEXT: bsrq %r8, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %rsi, %r14
+; SSE-NEXT: xorl $63, %r14d
+; SSE-NEXT: orl $64, %r14d
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %eax, %r14d
+; SSE-NEXT: bsrq %rdx, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %rcx, %rbx
+; SSE-NEXT: xorl $63, %ebx
+; SSE-NEXT: orl $64, %ebx
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %eax, %ebx
+; SSE-NEXT: subl $-128, %ebx
+; SSE-NEXT: movq %rsi, %rax
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: cmovnel %r14d, %ebx
+; SSE-NEXT: bsrq %r10, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %r9, %r14
+; SSE-NEXT: xorl $63, %r14d
+; SSE-NEXT: orl $64, %r14d
+; SSE-NEXT: testq %r10, %r10
+; SSE-NEXT: cmovnel %eax, %r14d
+; SSE-NEXT: bsrq %r11, %r15
+; SSE-NEXT: xorl $63, %r15d
+; SSE-NEXT: bsrq (%rdi), %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %r10, %r9
+; SSE-NEXT: cmovnel %r14d, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: orq %r8, %rdx
+; SSE-NEXT: orq %rsi, %rcx
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: cmovnel %ebx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: load_ctlz_undef_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq 8(%rdi), %r10
+; AVX2-NEXT: movq 16(%rdi), %r9
+; AVX2-NEXT: movq 32(%rdi), %rcx
+; AVX2-NEXT: movq 40(%rdi), %rdx
+; AVX2-NEXT: movq 48(%rdi), %rsi
+; AVX2-NEXT: movq 56(%rdi), %r8
+; AVX2-NEXT: lzcntq %r8, %rax
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: lzcntq %rsi, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rdx, %rax
+; AVX2-NEXT: lzcntq %rcx, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %eax, %r11d
+; AVX2-NEXT: subl $-128, %r11d
+; AVX2-NEXT: movq %rsi, %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: cmovnel %ebx, %r11d
+; AVX2-NEXT: movq 24(%rdi), %rbx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rbx, %rax
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: lzcntq %r9, %r14
+; AVX2-NEXT: addl $64, %r14d
+; AVX2-NEXT: testq %rbx, %rbx
+; AVX2-NEXT: cmovnel %eax, %r14d
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: lzcntq %r10, %r15
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq (%rdi), %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %r15d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rbx, %r9
+; AVX2-NEXT: cmovnel %r14d, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: orq %r8, %rdx
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: cmovnel %r11d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: load_ctlz_undef_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_ctlz_undef_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_ctlz_undef_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = load i512, ptr %p0
+ %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_undef_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: pextrq $1, %xmm1, %rax
+; SSE-NEXT: pextrq $1, %xmm2, %rsi
+; SSE-NEXT: movq %xmm2, %rdx
+; SSE-NEXT: movq %xmm3, %rdi
+; SSE-NEXT: pextrq $1, %xmm3, %r8
+; SSE-NEXT: bsrq %r8, %r9
+; SSE-NEXT: xorl $63, %r9d
+; SSE-NEXT: bsrq %rdi, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: orl $64, %edi
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %r9d, %edi
+; SSE-NEXT: bsrq %rsi, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: bsrq %rdx, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: orl $64, %edx
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %r8d, %edx
+; SSE-NEXT: movq %xmm0, %rsi
+; SSE-NEXT: subl $-128, %edx
+; SSE-NEXT: ptest %xmm3, %xmm3
+; SSE-NEXT: movq %xmm1, %r8
+; SSE-NEXT: cmovnel %edi, %edx
+; SSE-NEXT: bsrq %rax, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: bsrq %r8, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %edi, %r8d
+; SSE-NEXT: bsrq %rcx, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: bsrq %rsi, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_undef_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT: vmovq %xmm2, %rdx
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: vmovq %xmm2, %r8
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: vmovq %xmm1, %rdi
+; AVX2-NEXT: vpextrq $1, %xmm1, %r9
+; AVX2-NEXT: lzcntq %rax, %r10
+; AVX2-NEXT: lzcntq %r8, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: cmovnel %r10d, %r11d
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: lzcntq %r9, %r10
+; AVX2-NEXT: lzcntq %rdi, %rdi
+; AVX2-NEXT: addl $64, %edi
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %edi
+; AVX2-NEXT: subl $-128, %edi
+; AVX2-NEXT: orq %rax, %r8
+; AVX2-NEXT: cmovnel %r11d, %edi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rcx, %rax
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: lzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: lzcntq %rsi, %r9
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: vptest %ymm1, %ymm1
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_undef_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_undef_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_undef_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <16 x i32> %v0 to i512
+ %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind {
+; SSE-LABEL: test_ctlz_undef_i1024:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq %r9, %r12
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT: bsrq %r11, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %rsi, %rcx
+; SSE-NEXT: xorl $63, %ecx
+; SSE-NEXT: orl $64, %ecx
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovnel %eax, %ecx
+; SSE-NEXT: bsrq %rdx, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: bsrq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %r10d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: movq %rsi, %r9
+; SSE-NEXT: movq %rsi, %rbx
+; SSE-NEXT: orq %r11, %r9
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: bsrq %r15, %rcx
+; SSE-NEXT: xorl $63, %ecx
+; SSE-NEXT: bsrq %r13, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: orl $64, %esi
+; SSE-NEXT: testq %r15, %r15
+; SSE-NEXT: cmovnel %ecx, %esi
+; SSE-NEXT: bsrq %r14, %rcx
+; SSE-NEXT: xorl $63, %ecx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT: bsrq %r9, %rbp
+; SSE-NEXT: xorl $63, %ebp
+; SSE-NEXT: orl $64, %ebp
+; SSE-NEXT: testq %r14, %r14
+; SSE-NEXT: cmovnel %ecx, %ebp
+; SSE-NEXT: movq %r8, %r10
+; SSE-NEXT: subl $-128, %ebp
+; SSE-NEXT: movq %r13, %rcx
+; SSE-NEXT: orq %r15, %rcx
+; SSE-NEXT: cmovnel %esi, %ebp
+; SSE-NEXT: addl $256, %ebp # imm = 0x100
+; SSE-NEXT: orq %r11, %rdx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE-NEXT: orq %rbx, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: cmovnel %eax, %ebp
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT: bsrq %rdx, %rcx
+; SSE-NEXT: xorl $63, %ecx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; SSE-NEXT: bsrq %r8, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: bsrq %r12, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: bsrq %r10, %rcx
+; SSE-NEXT: xorl $63, %ecx
+; SSE-NEXT: orl $64, %ecx
+; SSE-NEXT: testq %r12, %r12
+; SSE-NEXT: cmovnel %esi, %ecx
+; SSE-NEXT: movq %rdi, %rbx
+; SSE-NEXT: subl $-128, %ecx
+; SSE-NEXT: movq %r8, %rsi
+; SSE-NEXT: orq %rdx, %rsi
+; SSE-NEXT: cmovnel %eax, %ecx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: bsrq %r11, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: bsrq %r8, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: orl $64, %edx
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovnel %eax, %edx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; SSE-NEXT: bsrq %rdi, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: bsrq %rbx, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: orq %r12, %r10
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r15
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r14
+; SSE-NEXT: orq %r15, %r14
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r13
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT: orq %r13, %r9
+; SSE-NEXT: addl $512, %eax # imm = 0x200
+; SSE-NEXT: orq %r14, %r9
+; SSE-NEXT: cmovnel %ebp, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ctlz_undef_i1024:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq %r9, %r14
+; AVX2-NEXT: movq %r8, %r11
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: lzcntq %r12, %rcx
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: lzcntq %r8, %r9
+; AVX2-NEXT: addl $64, %r9d
+; AVX2-NEXT: testq %r12, %r12
+; AVX2-NEXT: cmovnel %ecx, %r9d
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: lzcntq %r10, %rsi
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: lzcntq %rax, %rcx
+; AVX2-NEXT: addl $64, %ecx
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %esi, %ecx
+; AVX2-NEXT: subl $-128, %ecx
+; AVX2-NEXT: movq %r8, %rsi
+; AVX2-NEXT: orq %r12, %rsi
+; AVX2-NEXT: cmovnel %r9d, %ecx
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: lzcntq %rbx, %rdi
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: lzcntq %r15, %rsi
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %rbx, %rbx
+; AVX2-NEXT: cmovnel %edi, %esi
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; AVX2-NEXT: xorl %ebp, %ebp
+; AVX2-NEXT: lzcntq %r13, %rbp
+; AVX2-NEXT: addl $64, %ebp
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: lzcntq %r9, %rdi
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %edi, %ebp
+; AVX2-NEXT: subl $-128, %ebp
+; AVX2-NEXT: movq %r15, %rdi
+; AVX2-NEXT: orq %rbx, %rdi
+; AVX2-NEXT: cmovnel %esi, %ebp
+; AVX2-NEXT: addl $256, %ebp # imm = 0x100
+; AVX2-NEXT: movq %r10, %rdi
+; AVX2-NEXT: orq %r12, %rdi
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: orq %rdi, %rsi
+; AVX2-NEXT: cmovnel %ecx, %ebp
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rdi, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: lzcntq %r12, %rcx
+; AVX2-NEXT: testq %r12, %r12
+; AVX2-NEXT: cmovnel %ecx, %eax
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: lzcntq %r11, %rcx
+; AVX2-NEXT: addl $64, %ecx
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: lzcntq %r14, %rsi
+; AVX2-NEXT: testq %r14, %r14
+; AVX2-NEXT: cmovnel %esi, %ecx
+; AVX2-NEXT: subl $-128, %ecx
+; AVX2-NEXT: movq %rdi, %rsi
+; AVX2-NEXT: orq %r12, %rsi
+; AVX2-NEXT: cmovnel %eax, %ecx
+; AVX2-NEXT: movq %rdx, %rdi
+; AVX2-NEXT: lzcntq %rdx, %rdx
+; AVX2-NEXT: addl $64, %edx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %r10, %rax
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %eax, %edx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX2-NEXT: lzcntq %rsi, %r8
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %r10, %rdi
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: orq %r12, %r14
+; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: orq %r14, %r11
+; AVX2-NEXT: cmovnel %ecx, %eax
+; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r9
+; AVX2-NEXT: orq %rbx, %r9
+; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13
+; AVX2-NEXT: orq %r15, %r13
+; AVX2-NEXT: addl $512, %eax # imm = 0x200
+; AVX2-NEXT: orq %r9, %r13
+; AVX2-NEXT: cmovnel %ebp, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_ctlz_undef_i1024:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT: vmovq %rdi, %xmm0
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vmovq %rcx, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vmovq %r9, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %ecx
+; AVX512F-NEXT: addl $512, %ecx # imm = 0x200
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r11
+; AVX512F-NEXT: orq %r14, %r11
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rbx
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: orq %rbx, %r10
+; AVX512F-NEXT: orq %r11, %r10
+; AVX512F-NEXT: cmovel %ecx, %eax
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_ctlz_undef_i1024:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512VL-NEXT: vmovq %rdi, %xmm0
+; AVX512VL-NEXT: vmovq %rsi, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vmovq %rcx, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vmovq %r9, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %ecx
+; AVX512VL-NEXT: addl $512, %ecx # imm = 0x200
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r14
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT: orq %r14, %r11
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: orq %rbx, %r10
+; AVX512VL-NEXT: orq %r11, %r10
+; AVX512VL-NEXT: cmovel %ecx, %eax
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_ctlz_undef_i1024:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: pushq %r14
+; AVX512POPCNT-NEXT: pushq %rbx
+; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0
+; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1
+; AVX512POPCNT-NEXT: vmovq %rcx, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
+; AVX512POPCNT-NEXT: vmovq %r8, %xmm2
+; AVX512POPCNT-NEXT: vmovq %r9, %xmm3
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %ecx
+; AVX512POPCNT-NEXT: addl $512, %ecx # imm = 0x200
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r14
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r11
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rbx
+; AVX512POPCNT-NEXT: orq %r14, %r11
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; AVX512POPCNT-NEXT: orq %rbx, %r10
+; AVX512POPCNT-NEXT: orq %r11, %r10
+; AVX512POPCNT-NEXT: cmovel %ecx, %eax
+; AVX512POPCNT-NEXT: popq %rbx
+; AVX512POPCNT-NEXT: popq %r14
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1)
+ %res = trunc i1024 %cnt to i32
+ ret i32 %res
+}
+
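+; Memory-operand version of the zero-is-poison ctlz test: the i1024 source is loaded from %p0.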
+define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind {
+; SSE-LABEL: load_ctlz_undef_i1024:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq 40(%rdi), %rbp
+; SSE-NEXT: movq 64(%rdi), %rbx
+; SSE-NEXT: movq 72(%rdi), %r11
+; SSE-NEXT: movq 80(%rdi), %r12
+; SSE-NEXT: movq 88(%rdi), %r14
+; SSE-NEXT: movq 96(%rdi), %r13
+; SSE-NEXT: movq 104(%rdi), %r9
+; SSE-NEXT: movq 112(%rdi), %r10
+; SSE-NEXT: movq 120(%rdi), %r8
+; SSE-NEXT: bsrq %r8, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %r10, %rcx
+; SSE-NEXT: xorl $63, %ecx
+; SSE-NEXT: orl $64, %ecx
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %eax, %ecx
+; SSE-NEXT: bsrq %r9, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: bsrq %r13, %rax
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: movq %r10, %rdx
+; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: orq %r8, %rdx
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: bsrq %r14, %rcx
+; SSE-NEXT: xorl $63, %ecx
+; SSE-NEXT: movq %r12, %rsi
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: bsrq %r12, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: orl $64, %edx
+; SSE-NEXT: testq %r14, %r14
+; SSE-NEXT: cmovnel %ecx, %edx
+; SSE-NEXT: bsrq %r11, %rcx
+; SSE-NEXT: xorl $63, %ecx
+; SSE-NEXT: bsrq %rbx, %r15
+; SSE-NEXT: xorl $63, %r15d
+; SSE-NEXT: orl $64, %r15d
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovnel %ecx, %r15d
+; SSE-NEXT: movq 48(%rdi), %r12
+; SSE-NEXT: subl $-128, %r15d
+; SSE-NEXT: movq %rsi, %rcx
+; SSE-NEXT: orq %r14, %rcx
+; SSE-NEXT: cmovnel %edx, %r15d
+; SSE-NEXT: addl $256, %r15d # imm = 0x100
+; SSE-NEXT: movq %r9, %rcx
+; SSE-NEXT: orq %r8, %rcx
+; SSE-NEXT: movq %r13, %rdx
+; SSE-NEXT: orq %r10, %rdx
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: movq 56(%rdi), %r13
+; SSE-NEXT: cmovnel %eax, %r15d
+; SSE-NEXT: bsrq %r13, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %r12, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: orl $64, %edx
+; SSE-NEXT: testq %r13, %r13
+; SSE-NEXT: cmovnel %eax, %edx
+; SSE-NEXT: movq %rbp, %r10
+; SSE-NEXT: bsrq %rbp, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: movq 32(%rdi), %r8
+; SSE-NEXT: bsrq %r8, %rbp
+; SSE-NEXT: xorl $63, %ebp
+; SSE-NEXT: orl $64, %ebp
+; SSE-NEXT: testq %r10, %r10
+; SSE-NEXT: cmovnel %eax, %ebp
+; SSE-NEXT: subl $-128, %ebp
+; SSE-NEXT: movq %r12, %rax
+; SSE-NEXT: orq %r13, %rax
+; SSE-NEXT: cmovnel %edx, %ebp
+; SSE-NEXT: movq 24(%rdi), %r9
+; SSE-NEXT: bsrq %r9, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: movq 16(%rdi), %rsi
+; SSE-NEXT: bsrq %rsi, %rcx
+; SSE-NEXT: xorl $63, %ecx
+; SSE-NEXT: orl $64, %ecx
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %eax, %ecx
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: bsrq (%rdi), %rax
+; SSE-NEXT: bsrq %rdx, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %r9, %rsi
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: orq %r13, %r10
+; SSE-NEXT: orq %r12, %r8
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: orq %r10, %r8
+; SSE-NEXT: cmovnel %ebp, %eax
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE-NEXT: orq %r14, %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE-NEXT: orq %rcx, %rbx
+; SSE-NEXT: addl $512, %eax # imm = 0x200
+; SSE-NEXT: orq %r11, %rbx
+; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: load_ctlz_undef_i1024:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq 48(%rdi), %r9
+; AVX2-NEXT: movq 56(%rdi), %rbp
+; AVX2-NEXT: movq 64(%rdi), %r11
+; AVX2-NEXT: movq 72(%rdi), %r10
+; AVX2-NEXT: movq 80(%rdi), %r14
+; AVX2-NEXT: movq 88(%rdi), %rbx
+; AVX2-NEXT: movq 96(%rdi), %rdx
+; AVX2-NEXT: movq 104(%rdi), %r8
+; AVX2-NEXT: movq 112(%rdi), %rsi
+; AVX2-NEXT: movq 120(%rdi), %r15
+; AVX2-NEXT: lzcntq %r15, %rax
+; AVX2-NEXT: lzcntq %rsi, %rcx
+; AVX2-NEXT: addl $64, %ecx
+; AVX2-NEXT: testq %r15, %r15
+; AVX2-NEXT: cmovnel %eax, %ecx
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: lzcntq %r8, %r12
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rdx, %rax
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: movq %rsi, %r12
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: orq %r15, %r12
+; AVX2-NEXT: cmovnel %ecx, %eax
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: lzcntq %rbx, %rcx
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: xorl %r13d, %r13d
+; AVX2-NEXT: lzcntq %r14, %r13
+; AVX2-NEXT: addl $64, %r13d
+; AVX2-NEXT: testq %rbx, %rbx
+; AVX2-NEXT: cmovnel %ecx, %r13d
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: lzcntq %r10, %rcx
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: lzcntq %r11, %r12
+; AVX2-NEXT: addl $64, %r12d
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %ecx, %r12d
+; AVX2-NEXT: subl $-128, %r12d
+; AVX2-NEXT: movq %r14, %rcx
+; AVX2-NEXT: orq %rbx, %rcx
+; AVX2-NEXT: cmovnel %r13d, %r12d
+; AVX2-NEXT: addl $256, %r12d # imm = 0x100
+; AVX2-NEXT: movq %r8, %rcx
+; AVX2-NEXT: orq %r15, %rcx
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %eax, %r12d
+; AVX2-NEXT: movq %rbp, %r14
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: lzcntq %rbp, %rcx
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %r9, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rbp, %rbp
+; AVX2-NEXT: cmovnel %ecx, %eax
+; AVX2-NEXT: movq 32(%rdi), %r13
+; AVX2-NEXT: xorl %ebp, %ebp
+; AVX2-NEXT: lzcntq %r13, %rbp
+; AVX2-NEXT: addl $64, %ebp
+; AVX2-NEXT: movq 40(%rdi), %r8
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: lzcntq %r8, %rdx
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %edx, %ebp
+; AVX2-NEXT: subl $-128, %ebp
+; AVX2-NEXT: movq %r9, %rdx
+; AVX2-NEXT: orq %r14, %rdx
+; AVX2-NEXT: cmovnel %eax, %ebp
+; AVX2-NEXT: movq 16(%rdi), %r9
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: lzcntq %r9, %rcx
+; AVX2-NEXT: addl $64, %ecx
+; AVX2-NEXT: movq 24(%rdi), %rdx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rdx, %rax
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %eax, %ecx
+; AVX2-NEXT: movq 8(%rdi), %rsi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq (%rdi), %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: lzcntq %rsi, %rdi
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rdx, %r9
+; AVX2-NEXT: cmovnel %ecx, %eax
+; AVX2-NEXT: orq %r14, %r8
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: orq %r8, %r13
+; AVX2-NEXT: cmovnel %ebp, %eax
+; AVX2-NEXT: orq %r15, %rbx
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; AVX2-NEXT: orq %rbx, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX2-NEXT: orq %rcx, %r11
+; AVX2-NEXT: addl $512, %eax # imm = 0x200
+; AVX2-NEXT: orq %r10, %r11
+; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: load_ctlz_undef_i1024:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq 80(%rdi), %rsi
+; AVX512F-NEXT: movq 64(%rdi), %rcx
+; AVX512F-NEXT: movq 72(%rdi), %rdx
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512F-NEXT: movq 88(%rdi), %r8
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512F-NEXT: vmovd %xmm1, %r9d
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512F-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: orq 120(%rdi), %r8
+; AVX512F-NEXT: addl $512, %eax # imm = 0x200
+; AVX512F-NEXT: orq 104(%rdi), %rdx
+; AVX512F-NEXT: orq %r8, %rdx
+; AVX512F-NEXT: orq 112(%rdi), %rsi
+; AVX512F-NEXT: orq 96(%rdi), %rcx
+; AVX512F-NEXT: orq %rsi, %rcx
+; AVX512F-NEXT: orq %rdx, %rcx
+; AVX512F-NEXT: cmovnel %r9d, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_ctlz_undef_i1024:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movq 80(%rdi), %rsi
+; AVX512VL-NEXT: movq 64(%rdi), %rcx
+; AVX512VL-NEXT: movq 72(%rdi), %rdx
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512VL-NEXT: movq 88(%rdi), %r8
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm1, %r9d
+; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: addl $512, %eax # imm = 0x200
+; AVX512VL-NEXT: orq 120(%rdi), %r8
+; AVX512VL-NEXT: orq 104(%rdi), %rdx
+; AVX512VL-NEXT: orq 112(%rdi), %rsi
+; AVX512VL-NEXT: orq %r8, %rdx
+; AVX512VL-NEXT: orq 96(%rdi), %rcx
+; AVX512VL-NEXT: orq %rsi, %rcx
+; AVX512VL-NEXT: orq %rdx, %rcx
+; AVX512VL-NEXT: cmovnel %r9d, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_ctlz_undef_i1024:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: movq 80(%rdi), %rsi
+; AVX512POPCNT-NEXT: movq 64(%rdi), %rcx
+; AVX512POPCNT-NEXT: movq 72(%rdi), %rdx
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512POPCNT-NEXT: movq 88(%rdi), %r8
+; AVX512POPCNT-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %r9d
+; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200
+; AVX512POPCNT-NEXT: orq 120(%rdi), %r8
+; AVX512POPCNT-NEXT: orq 104(%rdi), %rdx
+; AVX512POPCNT-NEXT: orq 112(%rdi), %rsi
+; AVX512POPCNT-NEXT: orq %r8, %rdx
+; AVX512POPCNT-NEXT: orq 96(%rdi), %rcx
+; AVX512POPCNT-NEXT: orq %rsi, %rcx
+; AVX512POPCNT-NEXT: orq %rdx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %r9d, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%a0 = load i1024, ptr %p0
- %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 0)
+ %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1)
%res = trunc i1024 %cnt to i32
ret i32 %res
}
@@ -1886,6 +4634,49 @@ define i32 @load_cttz_i128(ptr %p0) nounwind {
ret i32 %res
}
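+; cttz of an i128 formed by bitcasting the <4 x i32> argument; the i1 0 flag keeps a zero input defined (result = 128).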
+define i32 @vector_cttz_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: movq %xmm0, %rdx
+; SSE-NEXT: rep bsfq %rdx, %rsi
+; SSE-NEXT: movl $64, %eax
+; SSE-NEXT: rep bsfq %rcx, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: tzcntq %rcx, %rdx
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vector_cttz_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: tzcntq %rcx, %rdx
+; AVX512-NEXT: tzcntq %rax, %rax
+; AVX512-NEXT: addl $64, %eax
+; AVX512-NEXT: testq %rcx, %rcx
+; AVX512-NEXT: cmovnel %edx, %eax
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128
+ %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_cttz_i256(i256 %a0) nounwind {
; SSE-LABEL: test_cttz_i256:
; SSE: # %bb.0:
@@ -1992,32 +4783,184 @@ define i32 @load_cttz_i256(ptr %p0) nounwind {
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: retq
;
-; AVX512-LABEL: load_cttz_i256:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movq 16(%rdi), %rcx
-; AVX512-NEXT: movq (%rdi), %rdx
-; AVX512-NEXT: movq 8(%rdi), %rsi
-; AVX512-NEXT: tzcntq %rdx, %rax
-; AVX512-NEXT: tzcntq %rsi, %r8
-; AVX512-NEXT: addl $64, %r8d
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %eax, %r8d
-; AVX512-NEXT: tzcntq %rcx, %r9
-; AVX512-NEXT: tzcntq 24(%rdi), %rax
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: cmovnel %r9d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %rsi, %rdx
-; AVX512-NEXT: cmovnel %r8d, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_cttz_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [256,256,256,256]
+; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpandn %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vplzcntq %zmm2, %zmm2
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [64,128,192,256]
+; AVX512F-NEXT: vpsubq %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_cttz_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vplzcntq %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256]
+; AVX512VL-NEXT: vpsubq %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512VL-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_cttz_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%a0 = load i256, ptr %p0
%cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0)
%res = trunc i256 %cnt to i32
ret i32 %res
}
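+; cttz of an i256 formed by bitcasting the <8 x i32> argument, again with a defined result for a zero input.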
+define i32 @vector_cttz_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm1, %rcx
+; SSE-NEXT: pextrq $1, %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rdx
+; SSE-NEXT: rep bsfq %rdx, %rsi
+; SSE-NEXT: rep bsfq %rax, %rdi
+; SSE-NEXT: addl $64, %edi
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %esi, %edi
+; SSE-NEXT: movq %xmm1, %rdx
+; SSE-NEXT: rep bsfq %rdx, %rsi
+; SSE-NEXT: movl $64, %eax
+; SSE-NEXT: rep bsfq %rcx, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: vmovq %xmm0, %rsi
+; AVX2-NEXT: tzcntq %rsi, %rdi
+; AVX2-NEXT: tzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: tzcntq %rcx, %rdi
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_cttz_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT: vmovq %xmm1, %rcx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: tzcntq %rsi, %rdi
+; AVX512F-NEXT: tzcntq %rdx, %r8
+; AVX512F-NEXT: addl $64, %r8d
+; AVX512F-NEXT: testq %rsi, %rsi
+; AVX512F-NEXT: cmovnel %edi, %r8d
+; AVX512F-NEXT: tzcntq %rcx, %rdi
+; AVX512F-NEXT: tzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edi, %eax
+; AVX512F-NEXT: subl $-128, %eax
+; AVX512F-NEXT: orq %rdx, %rsi
+; AVX512F-NEXT: cmovnel %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_cttz_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT: vmovq %xmm1, %rcx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT: vmovq %xmm0, %rsi
+; AVX512VL-NEXT: tzcntq %rsi, %rdi
+; AVX512VL-NEXT: tzcntq %rdx, %r8
+; AVX512VL-NEXT: addl $64, %r8d
+; AVX512VL-NEXT: testq %rsi, %rsi
+; AVX512VL-NEXT: cmovnel %edi, %r8d
+; AVX512VL-NEXT: tzcntq %rcx, %rdi
+; AVX512VL-NEXT: tzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edi, %eax
+; AVX512VL-NEXT: subl $-128, %eax
+; AVX512VL-NEXT: orq %rdx, %rsi
+; AVX512VL-NEXT: cmovnel %r8d, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi
+; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT: tzcntq %rdx, %r8
+; AVX512POPCNT-NEXT: addl $64, %r8d
+; AVX512POPCNT-NEXT: testq %rsi, %rsi
+; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT: tzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edi, %eax
+; AVX512POPCNT-NEXT: subl $-128, %eax
+; AVX512POPCNT-NEXT: orq %rdx, %rsi
+; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <8 x i32> %v0 to i256
+ %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_cttz_i512(i512 %a0) nounwind {
; SSE-LABEL: test_cttz_i512:
; SSE: # %bb.0:
@@ -2109,47 +5052,84 @@ define i32 @test_cttz_i512(i512 %a0) nounwind {
; AVX2-NEXT: popq %r14
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cttz_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: tzcntq %rdi, %rax
-; AVX512-NEXT: tzcntq %rsi, %rbx
-; AVX512-NEXT: addl $64, %ebx
-; AVX512-NEXT: testq %rdi, %rdi
-; AVX512-NEXT: cmovnel %eax, %ebx
-; AVX512-NEXT: tzcntq %rdx, %rax
-; AVX512-NEXT: tzcntq %rcx, %r10
-; AVX512-NEXT: addl $64, %r10d
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %eax, %r10d
-; AVX512-NEXT: subl $-128, %r10d
-; AVX512-NEXT: movq %rdi, %rax
-; AVX512-NEXT: orq %rsi, %rax
-; AVX512-NEXT: cmovnel %ebx, %r10d
-; AVX512-NEXT: tzcntq %r8, %rax
-; AVX512-NEXT: tzcntq %r9, %rbx
-; AVX512-NEXT: addl $64, %ebx
-; AVX512-NEXT: testq %r8, %r8
-; AVX512-NEXT: cmovnel %eax, %ebx
-; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: tzcntq %r11, %r14
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %r11, %r11
-; AVX512-NEXT: cmovnel %r14d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %r9, %r8
-; AVX512-NEXT: cmovnel %ebx, %eax
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %rcx, %rsi
-; AVX512-NEXT: orq %rdx, %rdi
-; AVX512-NEXT: orq %rsi, %rdi
-; AVX512-NEXT: cmovnel %r10d, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cttz_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vmovq %rdi, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vmovq %r9, %xmm1
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_cttz_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovq %rcx, %xmm0
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovq %rsi, %xmm1
+; AVX512VL-NEXT: vmovq %rdi, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vmovq %r9, %xmm1
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_cttz_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0
+; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1
+; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512POPCNT-NEXT: vmovq %r9, %xmm1
+; AVX512POPCNT-NEXT: vmovq %r8, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0)
%res = trunc i512 %cnt to i32
ret i32 %res
@@ -2263,59 +5243,199 @@ define i32 @load_cttz_i512(ptr %p0) nounwind {
; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
-; AVX512-LABEL: load_cttz_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq 48(%rdi), %r11
-; AVX512-NEXT: movq 40(%rdi), %r9
-; AVX512-NEXT: movq 32(%rdi), %r10
-; AVX512-NEXT: movq 24(%rdi), %r8
-; AVX512-NEXT: movq 16(%rdi), %rdx
-; AVX512-NEXT: movq (%rdi), %rcx
-; AVX512-NEXT: movq 8(%rdi), %rsi
-; AVX512-NEXT: tzcntq %rcx, %rax
-; AVX512-NEXT: tzcntq %rsi, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: tzcntq %rdx, %rax
-; AVX512-NEXT: tzcntq %r8, %rbx
-; AVX512-NEXT: addl $64, %ebx
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %eax, %ebx
-; AVX512-NEXT: subl $-128, %ebx
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: orq %rsi, %rax
-; AVX512-NEXT: cmovnel %r14d, %ebx
-; AVX512-NEXT: tzcntq %r10, %rax
-; AVX512-NEXT: tzcntq %r9, %r14
-; AVX512-NEXT: addl $64, %r14d
-; AVX512-NEXT: testq %r10, %r10
-; AVX512-NEXT: cmovnel %eax, %r14d
-; AVX512-NEXT: tzcntq 56(%rdi), %rax
-; AVX512-NEXT: tzcntq %r11, %rdi
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %r11, %r11
-; AVX512-NEXT: cmovnel %edi, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %r9, %r10
-; AVX512-NEXT: cmovnel %r14d, %eax
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %r8, %rsi
-; AVX512-NEXT: orq %rdx, %rcx
-; AVX512-NEXT: orq %rsi, %rcx
-; AVX512-NEXT: cmovnel %ebx, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: retq
+; AVX512F-LABEL: load_cttz_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_cttz_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_cttz_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%a0 = load i512, ptr %p0
%cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0)
%res = trunc i512 %cnt to i32
ret i32 %res
}
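+; cttz of an i512 formed by bitcasting the <16 x i32> argument, again with a defined result for a zero input.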
+define i32 @vector_cttz_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm3, %rdx
+; SSE-NEXT: movq %xmm3, %rcx
+; SSE-NEXT: pextrq $1, %xmm2, %rax
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: movq %xmm1, %rdi
+; SSE-NEXT: pextrq $1, %xmm0, %r8
+; SSE-NEXT: movq %xmm0, %r9
+; SSE-NEXT: rep bsfq %r9, %r10
+; SSE-NEXT: rep bsfq %r8, %r8
+; SSE-NEXT: addl $64, %r8d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %r10d, %r8d
+; SSE-NEXT: rep bsfq %rdi, %r9
+; SSE-NEXT: rep bsfq %rsi, %rsi
+; SSE-NEXT: addl $64, %esi
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %r9d, %esi
+; SSE-NEXT: movq %xmm2, %rdi
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %r8d, %esi
+; SSE-NEXT: rep bsfq %rdi, %r8
+; SSE-NEXT: rep bsfq %rax, %r9
+; SSE-NEXT: addl $64, %r9d
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %r8d, %r9d
+; SSE-NEXT: rep bsfq %rcx, %rdi
+; SSE-NEXT: movl $64, %eax
+; SSE-NEXT: rep bsfq %rdx, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %r9d, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT: vmovq %xmm1, %r8
+; AVX2-NEXT: vmovq %xmm0, %r9
+; AVX2-NEXT: tzcntq %r9, %r10
+; AVX2-NEXT: tzcntq %rdi, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %r11d
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: tzcntq %r8, %r10
+; AVX2-NEXT: tzcntq %rsi, %rsi
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %r10d, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %rdi, %r9
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: tzcntq %rdx, %rdi
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: tzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: vmovq %xmm2, %rdi
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: tzcntq %rdi, %r9
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rdi, %rdi
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_cttz_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_cttz_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <16 x i32> %v0 to i512
+ %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
define i32 @test_cttz_i1024(i1024 %a0) nounwind {
; SSE-LABEL: test_cttz_i1024:
; SSE: # %bb.0:
@@ -2547,111 +5667,136 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_cttz_i1024:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq %r9, %r14
-; AVX512-NEXT: movq %r8, %r15
-; AVX512-NEXT: movq %rcx, %r11
-; AVX512-NEXT: movq %rdx, %r10
-; AVX512-NEXT: movq %rsi, %r9
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: tzcntq %rdi, %rax
-; AVX512-NEXT: tzcntq %r9, %r12
-; AVX512-NEXT: addl $64, %r12d
-; AVX512-NEXT: testq %rdi, %rdi
-; AVX512-NEXT: cmovnel %eax, %r12d
-; AVX512-NEXT: tzcntq %rdx, %r13
-; AVX512-NEXT: tzcntq %r11, %rax
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %r13d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: movq %rdi, %r13
-; AVX512-NEXT: orq %r9, %r13
-; AVX512-NEXT: cmovnel %r12d, %eax
-; AVX512-NEXT: tzcntq %r8, %r12
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: tzcntq %r14, %r13
-; AVX512-NEXT: addl $64, %r13d
-; AVX512-NEXT: testq %r8, %r8
-; AVX512-NEXT: cmovnel %r12d, %r13d
-; AVX512-NEXT: tzcntq %rcx, %rbp
-; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %r12
-; AVX512-NEXT: addl $64, %r12d
-; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: cmovnel %ebp, %r12d
-; AVX512-NEXT: subl $-128, %r12d
-; AVX512-NEXT: movq %r8, %rbp
-; AVX512-NEXT: orq %r14, %rbp
-; AVX512-NEXT: cmovnel %r13d, %r12d
-; AVX512-NEXT: addl $256, %r12d # imm = 0x100
-; AVX512-NEXT: movq %r9, %r13
-; AVX512-NEXT: orq %r11, %r13
-; AVX512-NEXT: movq %rdi, %rbp
-; AVX512-NEXT: orq %rdx, %rbp
-; AVX512-NEXT: orq %r13, %rbp
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX512-NEXT: cmovnel %eax, %r12d
-; AVX512-NEXT: tzcntq %rbx, %rbp
-; AVX512-NEXT: tzcntq %r13, %rax
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %rbx, %rbx
-; AVX512-NEXT: cmovnel %ebp, %eax
-; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rbp
-; AVX512-NEXT: addl $64, %ebp
-; AVX512-NEXT: tzcntq %rsi, %rcx
-; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: cmovnel %ecx, %ebp
-; AVX512-NEXT: subl $-128, %ebp
-; AVX512-NEXT: movq %rbx, %rcx
-; AVX512-NEXT: orq %r13, %rcx
-; AVX512-NEXT: cmovnel %eax, %ebp
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512-NEXT: tzcntq %r14, %rcx
-; AVX512-NEXT: addl $64, %ecx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: tzcntq %rdx, %rax
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %eax, %ecx
-; AVX512-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT: tzcntq %r8, %rsi
-; AVX512-NEXT: testq %r8, %r8
-; AVX512-NEXT: cmovnel %esi, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %r14, %rdx
-; AVX512-NEXT: cmovnel %ecx, %eax
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r13
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %r13, %rbx
-; AVX512-NEXT: cmovnel %ebp, %eax
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512-NEXT: orq %r11, %r9
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: orq %r15, %rdi
-; AVX512-NEXT: orq %r10, %rdi
-; AVX512-NEXT: addl $512, %eax # imm = 0x200
-; AVX512-NEXT: orq %r9, %rdi
-; AVX512-NEXT: cmovnel %r12d, %eax
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: retq
+; AVX512F-LABEL: test_cttz_i1024:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vmovq %rdi, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vmovq %r9, %xmm1
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT: vpandnq %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vplzcntq %zmm3, %zmm3
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %r10d
+; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm0
+; AVX512F-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpsubq %zmm0, %zmm4, %zmm0
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: addl $512, %eax # imm = 0x200
+; AVX512F-NEXT: orq %r9, %rsi
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT: orq %rsi, %rcx
+; AVX512F-NEXT: orq %r8, %rdi
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rdx
+; AVX512F-NEXT: orq %rdi, %rdx
+; AVX512F-NEXT: orq %rcx, %rdx
+; AVX512F-NEXT: cmovnel %r10d, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_cttz_i1024:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovq %rcx, %xmm0
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovq %rsi, %xmm1
+; AVX512VL-NEXT: vmovq %rdi, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vmovq %r9, %xmm2
+; AVX512VL-NEXT: vmovq %r8, %xmm3
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512VL-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT: vpandnq %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vplzcntq %zmm3, %zmm3
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vpsubq %zmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %r10d
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm0
+; AVX512VL-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512VL-NEXT: vpsubq %zmm0, %zmm4, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: addl $512, %eax # imm = 0x200
+; AVX512VL-NEXT: orq %r9, %rsi
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rcx
+; AVX512VL-NEXT: orq %rsi, %rcx
+; AVX512VL-NEXT: orq %r8, %rdi
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rdx
+; AVX512VL-NEXT: orq %rdi, %rdx
+; AVX512VL-NEXT: orq %rcx, %rdx
+; AVX512VL-NEXT: cmovnel %r10d, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_cttz_i1024:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0
+; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1
+; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vmovq %r9, %xmm2
+; AVX512POPCNT-NEXT: vmovq %r8, %xmm3
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm0, %zmm3
+; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %r10d
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm0
+; AVX512POPCNT-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; AVX512POPCNT-NEXT: vpopcntq %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %eax
+; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200
+; AVX512POPCNT-NEXT: orq %r9, %rsi
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rcx
+; AVX512POPCNT-NEXT: orq %rsi, %rcx
+; AVX512POPCNT-NEXT: orq %r8, %rdi
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rdx
+; AVX512POPCNT-NEXT: orq %rdi, %rdx
+; AVX512POPCNT-NEXT: orq %rcx, %rdx
+; AVX512POPCNT-NEXT: cmovnel %r10d, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0)
%res = trunc i1024 %cnt to i32
ret i32 %res
@@ -2900,122 +6045,1693 @@ define i32 @load_cttz_i1024(ptr %p0) nounwind {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512-LABEL: load_cttz_i1024:
+; AVX512F-LABEL: load_cttz_i1024:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1
+; AVX512F-NEXT: movq 16(%rdi), %rax
+; AVX512F-NEXT: movq (%rdi), %rcx
+; AVX512F-NEXT: movq 8(%rdi), %rdx
+; AVX512F-NEXT: movq 24(%rdi), %rsi
+; AVX512F-NEXT: orq 56(%rdi), %rsi
+; AVX512F-NEXT: orq 40(%rdi), %rdx
+; AVX512F-NEXT: orq 48(%rdi), %rax
+; AVX512F-NEXT: orq 32(%rdi), %rcx
+; AVX512F-NEXT: orq %rsi, %rdx
+; AVX512F-NEXT: orq %rax, %rcx
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm3
+; AVX512F-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT: vplzcntq %zmm3, %zmm3
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm1, %esi
+; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vpsubq %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: addl $512, %eax # imm = 0x200
+; AVX512F-NEXT: orq %rdx, %rcx
+; AVX512F-NEXT: cmovnel %esi, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_cttz_i1024:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm1
+; AVX512VL-NEXT: movq 16(%rdi), %rax
+; AVX512VL-NEXT: movq (%rdi), %rcx
+; AVX512VL-NEXT: movq 8(%rdi), %rdx
+; AVX512VL-NEXT: movq 24(%rdi), %rsi
+; AVX512VL-NEXT: orq 56(%rdi), %rsi
+; AVX512VL-NEXT: orq 40(%rdi), %rdx
+; AVX512VL-NEXT: orq %rsi, %rdx
+; AVX512VL-NEXT: orq 48(%rdi), %rax
+; AVX512VL-NEXT: orq 32(%rdi), %rcx
+; AVX512VL-NEXT: orq %rax, %rcx
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm3
+; AVX512VL-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512VL-NEXT: vplzcntq %zmm3, %zmm3
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vpsubq %zmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm1, %esi
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: addl $512, %eax # imm = 0x200
+; AVX512VL-NEXT: orq %rdx, %rcx
+; AVX512VL-NEXT: cmovnel %esi, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_cttz_i1024:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm1
+; AVX512POPCNT-NEXT: movq 16(%rdi), %rax
+; AVX512POPCNT-NEXT: movq (%rdi), %rcx
+; AVX512POPCNT-NEXT: movq 8(%rdi), %rdx
+; AVX512POPCNT-NEXT: movq 24(%rdi), %rsi
+; AVX512POPCNT-NEXT: orq 56(%rdi), %rsi
+; AVX512POPCNT-NEXT: orq 40(%rdi), %rdx
+; AVX512POPCNT-NEXT: orq %rsi, %rdx
+; AVX512POPCNT-NEXT: orq 48(%rdi), %rax
+; AVX512POPCNT-NEXT: orq 32(%rdi), %rcx
+; AVX512POPCNT-NEXT: orq %rax, %rcx
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm3
+; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %esi
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200
+; AVX512POPCNT-NEXT: orq %rdx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %esi, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = load i1024, ptr %p0
+ %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0)
+ %res = trunc i1024 %cnt to i32
+ ret i32 %res
+}
+
+;
+; CTTZ_ZERO_UNDEF
+;
+
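+; The CTTZ_ZERO_UNDEF tests below call the llvm.cttz.* intrinsics with the
+; is_zero_poison flag set (second operand i1 -1), unlike the tests above,
+; which pass i1 0.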
+define i32 @test_cttz_undef_i128(i128 %a0) nounwind {
+; SSE-LABEL: test_cttz_undef_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: rep bsfq %rdi, %rcx
+; SSE-NEXT: rep bsfq %rsi, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_cttz_undef_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: tzcntq %rdi, %rcx
+; AVX2-NEXT: tzcntq %rsi, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rdi, %rdi
+; AVX2-NEXT: cmovnel %ecx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cttz_undef_i128:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq 88(%rdi), %rbp
-; AVX512-NEXT: movq 72(%rdi), %r15
-; AVX512-NEXT: movq 56(%rdi), %r9
-; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rdi), %rcx
-; AVX512-NEXT: movq 40(%rdi), %r10
-; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 32(%rdi), %rsi
-; AVX512-NEXT: movq 24(%rdi), %r14
-; AVX512-NEXT: movq 16(%rdi), %rbx
-; AVX512-NEXT: movq (%rdi), %r8
-; AVX512-NEXT: movq 8(%rdi), %r11
-; AVX512-NEXT: tzcntq %r8, %rax
-; AVX512-NEXT: tzcntq %r11, %rdx
-; AVX512-NEXT: addl $64, %edx
-; AVX512-NEXT: testq %r8, %r8
-; AVX512-NEXT: cmovnel %eax, %edx
-; AVX512-NEXT: tzcntq %rbx, %r12
-; AVX512-NEXT: tzcntq %r14, %rax
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: tzcntq %rdi, %rcx
+; AVX512-NEXT: tzcntq %rsi, %rax
; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %rbx, %rbx
-; AVX512-NEXT: cmovnel %r12d, %eax
-; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: movq %r8, %r12
-; AVX512-NEXT: orq %r11, %r12
-; AVX512-NEXT: cmovnel %edx, %eax
-; AVX512-NEXT: tzcntq %rsi, %rdx
-; AVX512-NEXT: tzcntq %r10, %r13
-; AVX512-NEXT: addl $64, %r13d
-; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: cmovnel %edx, %r13d
-; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: testq %rdi, %rdi
+; AVX512-NEXT: cmovnel %ecx, %eax
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
+ %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @load_cttz_undef_i128(ptr %p0) nounwind {
+; SSE-LABEL: load_cttz_undef_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: rep bsfq %rcx, %rdx
+; SSE-NEXT: rep bsfq 8(%rdi), %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: load_cttz_undef_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rcx
+; AVX2-NEXT: tzcntq %rcx, %rdx
+; AVX2-NEXT: tzcntq 8(%rdi), %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: load_cttz_undef_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movq (%rdi), %rcx
; AVX512-NEXT: tzcntq %rcx, %rdx
-; AVX512-NEXT: tzcntq %r9, %r12
-; AVX512-NEXT: addl $64, %r12d
+; AVX512-NEXT: tzcntq 8(%rdi), %rax
+; AVX512-NEXT: addl $64, %eax
; AVX512-NEXT: testq %rcx, %rcx
-; AVX512-NEXT: cmovnel %edx, %r12d
-; AVX512-NEXT: subl $-128, %r12d
-; AVX512-NEXT: movq %rsi, %rdx
-; AVX512-NEXT: orq %r10, %rdx
-; AVX512-NEXT: cmovnel %r13d, %r12d
-; AVX512-NEXT: addl $256, %r12d # imm = 0x100
-; AVX512-NEXT: movq %r11, %rdx
-; AVX512-NEXT: orq %r14, %rdx
-; AVX512-NEXT: movq %r8, %r13
-; AVX512-NEXT: orq %rbx, %r13
-; AVX512-NEXT: orq %rdx, %r13
-; AVX512-NEXT: movq 64(%rdi), %r13
-; AVX512-NEXT: cmovnel %eax, %r12d
-; AVX512-NEXT: tzcntq %r13, %rdx
-; AVX512-NEXT: tzcntq %r15, %rax
+; AVX512-NEXT: cmovnel %edx, %eax
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
+ %a0 = load i128, ptr %p0
+ %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @vector_cttz_undef_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_undef_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: rep bsfq %rcx, %rdx
+; SSE-NEXT: rep bsfq %rax, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_undef_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: tzcntq %rcx, %rdx
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: vector_cttz_undef_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: tzcntq %rcx, %rdx
+; AVX512-NEXT: tzcntq %rax, %rax
; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: testq %r13, %r13
+; AVX512-NEXT: testq %rcx, %rcx
; AVX512-NEXT: cmovnel %edx, %eax
-; AVX512-NEXT: movq %rbp, %r14
-; AVX512-NEXT: tzcntq %rbp, %rbp
-; AVX512-NEXT: addl $64, %ebp
-; AVX512-NEXT: movq 80(%rdi), %r10
-; AVX512-NEXT: tzcntq %r10, %rcx
-; AVX512-NEXT: testq %r10, %r10
-; AVX512-NEXT: cmovnel %ecx, %ebp
-; AVX512-NEXT: subl $-128, %ebp
-; AVX512-NEXT: movq %r13, %rcx
-; AVX512-NEXT: orq %r15, %rcx
-; AVX512-NEXT: cmovnel %eax, %ebp
-; AVX512-NEXT: movq 104(%rdi), %r9
-; AVX512-NEXT: tzcntq %r9, %rcx
-; AVX512-NEXT: addl $64, %ecx
-; AVX512-NEXT: movq 96(%rdi), %rdx
-; AVX512-NEXT: tzcntq %rdx, %rax
-; AVX512-NEXT: testq %rdx, %rdx
-; AVX512-NEXT: cmovnel %eax, %ecx
-; AVX512-NEXT: movq 112(%rdi), %rsi
-; AVX512-NEXT: tzcntq 120(%rdi), %rax
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128
+ %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @test_cttz_undef_i256(i256 %a0) nounwind {
+; SSE-LABEL: test_cttz_undef_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: rep bsfq %rdi, %rax
+; SSE-NEXT: rep bsfq %rsi, %r8
+; SSE-NEXT: addl $64, %r8d
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %eax, %r8d
+; SSE-NEXT: rep bsfq %rdx, %r9
+; SSE-NEXT: rep bsfq %rcx, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %r9d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %rsi, %rdi
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_cttz_undef_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: tzcntq %rdi, %rax
+; AVX2-NEXT: tzcntq %rsi, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rdi, %rdi
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: tzcntq %rdx, %r9
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %rcx, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rsi, %rdi
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_cttz_undef_i256:
+; AVX512: # %bb.0:
+; AVX512-NEXT: tzcntq %rdi, %rax
+; AVX512-NEXT: tzcntq %rsi, %r8
+; AVX512-NEXT: addl $64, %r8d
+; AVX512-NEXT: testq %rdi, %rdi
+; AVX512-NEXT: cmovnel %eax, %r8d
+; AVX512-NEXT: tzcntq %rdx, %r9
+; AVX512-NEXT: tzcntq %rcx, %rax
; AVX512-NEXT: addl $64, %eax
-; AVX512-NEXT: tzcntq %rsi, %rdi
-; AVX512-NEXT: testq %rsi, %rsi
-; AVX512-NEXT: cmovnel %edi, %eax
+; AVX512-NEXT: testq %rdx, %rdx
+; AVX512-NEXT: cmovnel %r9d, %eax
; AVX512-NEXT: subl $-128, %eax
-; AVX512-NEXT: orq %r9, %rdx
-; AVX512-NEXT: cmovnel %ecx, %eax
-; AVX512-NEXT: orq %r14, %r15
-; AVX512-NEXT: orq %r10, %r13
-; AVX512-NEXT: addl $256, %eax # imm = 0x100
-; AVX512-NEXT: orq %r15, %r13
-; AVX512-NEXT: cmovnel %ebp, %eax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX512-NEXT: orq %rcx, %r11
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX512-NEXT: orq %rbx, %r8
-; AVX512-NEXT: addl $512, %eax # imm = 0x200
-; AVX512-NEXT: orq %r11, %r8
-; AVX512-NEXT: cmovnel %r12d, %eax
+; AVX512-NEXT: orq %rsi, %rdi
+; AVX512-NEXT: cmovnel %r8d, %eax
; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
+ %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @load_cttz_undef_i256(ptr %p0) nounwind {
+; SSE-LABEL: load_cttz_undef_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: movq 16(%rdi), %rcx
+; SSE-NEXT: movq (%rdi), %rdx
+; SSE-NEXT: movq 8(%rdi), %rsi
+; SSE-NEXT: rep bsfq %rdx, %rax
+; SSE-NEXT: rep bsfq %rsi, %r8
+; SSE-NEXT: addl $64, %r8d
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %eax, %r8d
+; SSE-NEXT: rep bsfq %rcx, %r9
+; SSE-NEXT: rep bsfq 24(%rdi), %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %r9d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %rsi, %rdx
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: load_cttz_undef_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rcx
+; AVX2-NEXT: movq 8(%rdi), %rdx
+; AVX2-NEXT: tzcntq %rcx, %rax
+; AVX2-NEXT: tzcntq %rdx, %rsi
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %eax, %esi
+; AVX2-NEXT: movq 16(%rdi), %r8
+; AVX2-NEXT: tzcntq %r8, %r9
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq 24(%rdi), %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: load_cttz_undef_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512F-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256]
+; AVX512F-NEXT: vpsubq %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_cttz_undef_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512VL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [64,128,192,256]
+; AVX512VL-NEXT: vplzcntq %ymm1, %ymm1
+; AVX512VL-NEXT: vpsubq %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_cttz_undef_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = load i256, ptr %p0
+ %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_undef_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm1, %rax
+; SSE-NEXT: movq %xmm1, %rcx
+; SSE-NEXT: pextrq $1, %xmm0, %rdx
+; SSE-NEXT: movq %xmm0, %rsi
+; SSE-NEXT: rep bsfq %rsi, %rdi
+; SSE-NEXT: rep bsfq %rdx, %rdx
+; SSE-NEXT: addl $64, %edx
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %edx
+; SSE-NEXT: rep bsfq %rcx, %rsi
+; SSE-NEXT: rep bsfq %rax, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_undef_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: vmovq %xmm1, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: vmovq %xmm0, %rsi
+; AVX2-NEXT: tzcntq %rsi, %rdi
+; AVX2-NEXT: tzcntq %rdx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: tzcntq %rcx, %rdi
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_cttz_undef_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT: vmovq %xmm1, %rcx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: tzcntq %rsi, %rdi
+; AVX512F-NEXT: tzcntq %rdx, %r8
+; AVX512F-NEXT: addl $64, %r8d
+; AVX512F-NEXT: testq %rsi, %rsi
+; AVX512F-NEXT: cmovnel %edi, %r8d
+; AVX512F-NEXT: tzcntq %rcx, %rdi
+; AVX512F-NEXT: tzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edi, %eax
+; AVX512F-NEXT: subl $-128, %eax
+; AVX512F-NEXT: orq %rdx, %rsi
+; AVX512F-NEXT: cmovnel %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_cttz_undef_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT: vmovq %xmm1, %rcx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT: vmovq %xmm0, %rsi
+; AVX512VL-NEXT: tzcntq %rsi, %rdi
+; AVX512VL-NEXT: tzcntq %rdx, %r8
+; AVX512VL-NEXT: addl $64, %r8d
+; AVX512VL-NEXT: testq %rsi, %rsi
+; AVX512VL-NEXT: cmovnel %edi, %r8d
+; AVX512VL-NEXT: tzcntq %rcx, %rdi
+; AVX512VL-NEXT: tzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edi, %eax
+; AVX512VL-NEXT: subl $-128, %eax
+; AVX512VL-NEXT: orq %rdx, %rsi
+; AVX512VL-NEXT: cmovnel %r8d, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_undef_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi
+; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT: tzcntq %rdx, %r8
+; AVX512POPCNT-NEXT: addl $64, %r8d
+; AVX512POPCNT-NEXT: testq %rsi, %rsi
+; AVX512POPCNT-NEXT: cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT: tzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edi, %eax
+; AVX512POPCNT-NEXT: subl $-128, %eax
+; AVX512POPCNT-NEXT: orq %rdx, %rsi
+; AVX512POPCNT-NEXT: cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <8 x i32> %v0 to i256
+ %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @test_cttz_undef_i512(i512 %a0) nounwind {
+; SSE-LABEL: test_cttz_undef_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: rep bsfq %rdi, %rax
+; SSE-NEXT: rep bsfq %rsi, %r11
+; SSE-NEXT: addl $64, %r11d
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %eax, %r11d
+; SSE-NEXT: rep bsfq %rdx, %rax
+; SSE-NEXT: rep bsfq %rcx, %r10
+; SSE-NEXT: addl $64, %r10d
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %eax, %r10d
+; SSE-NEXT: subl $-128, %r10d
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: orq %rsi, %rax
+; SSE-NEXT: cmovnel %r11d, %r10d
+; SSE-NEXT: rep bsfq %r8, %rax
+; SSE-NEXT: rep bsfq %r9, %r11
+; SSE-NEXT: addl $64, %r11d
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %eax, %r11d
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; SSE-NEXT: rep bsfq %rbx, %r14
+; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rbx, %rbx
+; SSE-NEXT: cmovnel %r14d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: cmovnel %r11d, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: orq %rcx, %rsi
+; SSE-NEXT: orq %rdx, %rdi
+; SSE-NEXT: orq %rsi, %rdi
+; SSE-NEXT: cmovnel %r10d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_cttz_undef_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: tzcntq %rdi, %rax
+; AVX2-NEXT: tzcntq %rsi, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %rdi, %rdi
+; AVX2-NEXT: cmovnel %eax, %r11d
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %rdx, %rax
+; AVX2-NEXT: tzcntq %rcx, %r10
+; AVX2-NEXT: addl $64, %r10d
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %eax, %r10d
+; AVX2-NEXT: subl $-128, %r10d
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: orq %rsi, %rax
+; AVX2-NEXT: cmovnel %r11d, %r10d
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %r8, %rax
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %r9, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: tzcntq %r11, %r14
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovnel %r14d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %ebx, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: orq %rcx, %rsi
+; AVX2-NEXT: orq %rdx, %rdi
+; AVX2-NEXT: orq %rsi, %rdi
+; AVX2-NEXT: cmovnel %r10d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_cttz_undef_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vmovq %rdi, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vmovq %r9, %xmm1
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_cttz_undef_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovq %rcx, %xmm0
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovq %rsi, %xmm1
+; AVX512VL-NEXT: vmovq %rdi, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vmovq %r9, %xmm1
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_cttz_undef_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0
+; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1
+; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vmovq %r9, %xmm2
+; AVX512POPCNT-NEXT: vmovq %r8, %xmm3
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm2, %ymm2
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @load_cttz_undef_i512(ptr %p0) nounwind {
+; SSE-LABEL: load_cttz_undef_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq 40(%rdi), %r9
+; SSE-NEXT: movq 24(%rdi), %r8
+; SSE-NEXT: movq 16(%rdi), %rdx
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %rsi
+; SSE-NEXT: rep bsfq %rcx, %rax
+; SSE-NEXT: rep bsfq %rsi, %r11
+; SSE-NEXT: addl $64, %r11d
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %eax, %r11d
+; SSE-NEXT: rep bsfq %rdx, %rax
+; SSE-NEXT: rep bsfq %r8, %r10
+; SSE-NEXT: addl $64, %r10d
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %eax, %r10d
+; SSE-NEXT: movq 32(%rdi), %rbx
+; SSE-NEXT: subl $-128, %r10d
+; SSE-NEXT: movq %rcx, %rax
+; SSE-NEXT: orq %rsi, %rax
+; SSE-NEXT: cmovnel %r11d, %r10d
+; SSE-NEXT: rep bsfq %rbx, %rax
+; SSE-NEXT: rep bsfq %r9, %r11
+; SSE-NEXT: addl $64, %r11d
+; SSE-NEXT: testq %rbx, %rbx
+; SSE-NEXT: cmovnel %eax, %r11d
+; SSE-NEXT: movq 48(%rdi), %r14
+; SSE-NEXT: rep bsfq %r14, %r15
+; SSE-NEXT: rep bsfq 56(%rdi), %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %r14, %r14
+; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %r9, %rbx
+; SSE-NEXT: cmovnel %r11d, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: orq %r8, %rsi
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: orq %rsi, %rcx
+; SSE-NEXT: cmovnel %r10d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: load_cttz_undef_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq 48(%rdi), %r10
+; AVX2-NEXT: movq 40(%rdi), %r9
+; AVX2-NEXT: movq 24(%rdi), %r8
+; AVX2-NEXT: movq 16(%rdi), %rdx
+; AVX2-NEXT: movq (%rdi), %rcx
+; AVX2-NEXT: movq 8(%rdi), %rsi
+; AVX2-NEXT: tzcntq %rcx, %rax
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %rsi, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %rdx, %rax
+; AVX2-NEXT: tzcntq %r8, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %eax, %r11d
+; AVX2-NEXT: subl $-128, %r11d
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: orq %rsi, %rax
+; AVX2-NEXT: cmovnel %ebx, %r11d
+; AVX2-NEXT: movq 32(%rdi), %rbx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %rbx, %rax
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: tzcntq %r9, %r14
+; AVX2-NEXT: addl $64, %r14d
+; AVX2-NEXT: testq %rbx, %rbx
+; AVX2-NEXT: cmovnel %eax, %r14d
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: tzcntq %r10, %r15
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq 56(%rdi), %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %r15d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %r9, %rbx
+; AVX2-NEXT: cmovnel %r14d, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: cmovnel %r11d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: load_cttz_undef_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_cttz_undef_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_cttz_undef_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = load i512, ptr %p0
+ %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_undef_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm3, %rax
+; SSE-NEXT: pextrq $1, %xmm2, %rdx
+; SSE-NEXT: pextrq $1, %xmm1, %rcx
+; SSE-NEXT: movq %xmm1, %rsi
+; SSE-NEXT: pextrq $1, %xmm0, %rdi
+; SSE-NEXT: movq %xmm0, %r8
+; SSE-NEXT: rep bsfq %r8, %r9
+; SSE-NEXT: rep bsfq %rdi, %rdi
+; SSE-NEXT: addl $64, %edi
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %r9d, %edi
+; SSE-NEXT: rep bsfq %rsi, %r8
+; SSE-NEXT: rep bsfq %rcx, %rcx
+; SSE-NEXT: addl $64, %ecx
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %r8d, %ecx
+; SSE-NEXT: movq %xmm2, %rsi
+; SSE-NEXT: subl $-128, %ecx
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %edi, %ecx
+; SSE-NEXT: rep bsfq %rsi, %rdi
+; SSE-NEXT: rep bsfq %rdx, %rdx
+; SSE-NEXT: addl $64, %edx
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %edx
+; SSE-NEXT: movq %xmm3, %rsi
+; SSE-NEXT: rep bsfq %rsi, %rdi
+; SSE-NEXT: rep bsfq %rax, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_cttz_undef_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT: vmovq %xmm1, %r8
+; AVX2-NEXT: vmovq %xmm0, %r9
+; AVX2-NEXT: tzcntq %r9, %r10
+; AVX2-NEXT: tzcntq %rdi, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %r11d
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: tzcntq %r8, %r10
+; AVX2-NEXT: tzcntq %rsi, %rsi
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %r10d, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %rdi, %r9
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: xorl %edi, %edi
+; AVX2-NEXT: tzcntq %rdx, %rdi
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: tzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %edi, %r8d
+; AVX2-NEXT: vmovq %xmm2, %rdi
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: tzcntq %rdi, %r9
+; AVX2-NEXT: tzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rdi, %rdi
+; AVX2-NEXT: cmovnel %r9d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_cttz_undef_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_cttz_undef_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_undef_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <16 x i32> %v0 to i512
+ %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind {
+; SSE-LABEL: test_cttz_undef_i1024:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq %r9, %r14
+; SSE-NEXT: movq %rcx, %rbx
+; SSE-NEXT: movq %rdx, %r10
+; SSE-NEXT: movq %rsi, %r9
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: rep bsfq %rdi, %rax
+; SSE-NEXT: rep bsfq %rsi, %r12
+; SSE-NEXT: addl $64, %r12d
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %eax, %r12d
+; SSE-NEXT: rep bsfq %r10, %r15
+; SSE-NEXT: rep bsfq %rbx, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %r10, %r10
+; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: movq %rdi, %r13
+; SSE-NEXT: orq %rsi, %r13
+; SSE-NEXT: cmovnel %r12d, %eax
+; SSE-NEXT: movq %r8, %r15
+; SSE-NEXT: rep bsfq %r8, %r12
+; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: rep bsfq %r14, %r13
+; SSE-NEXT: addl $64, %r13d
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %r12d, %r13d
+; SSE-NEXT: rep bsfq %rcx, %rbp
+; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT: addl $64, %r12d
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %ebp, %r12d
+; SSE-NEXT: subl $-128, %r12d
+; SSE-NEXT: movq %r8, %rbp
+; SSE-NEXT: orq %r14, %rbp
+; SSE-NEXT: cmovnel %r13d, %r12d
+; SSE-NEXT: addl $256, %r12d # imm = 0x100
+; SSE-NEXT: movq %rsi, %r13
+; SSE-NEXT: orq %rbx, %r13
+; SSE-NEXT: movq %rdi, %rbp
+; SSE-NEXT: orq %r10, %rbp
+; SSE-NEXT: orq %r13, %rbp
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; SSE-NEXT: cmovnel %eax, %r12d
+; SSE-NEXT: rep bsfq %r11, %rbp
+; SSE-NEXT: rep bsfq %r13, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovnel %ebp, %eax
+; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rbp
+; SSE-NEXT: addl $64, %ebp
+; SSE-NEXT: rep bsfq %rdx, %rcx
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %ecx, %ebp
+; SSE-NEXT: subl $-128, %ebp
+; SSE-NEXT: movq %r11, %rcx
+; SSE-NEXT: orq %r13, %rcx
+; SSE-NEXT: cmovnel %eax, %ebp
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; SSE-NEXT: rep bsfq %r14, %rcx
+; SSE-NEXT: addl $64, %ecx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT: rep bsfq %rdx, %rax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %eax, %ecx
+; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; SSE-NEXT: rep bsfq %r8, %rsi
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %r14, %rdx
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r13
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: orq %r13, %r11
+; SSE-NEXT: cmovnel %ebp, %eax
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %rbx
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; SSE-NEXT: orq %rbx, %r9
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: orq %r15, %rdi
+; SSE-NEXT: orq %r10, %rdi
+; SSE-NEXT: addl $512, %eax # imm = 0x200
+; SSE-NEXT: orq %r9, %rdi
+; SSE-NEXT: cmovnel %r12d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_cttz_undef_i1024:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq %r9, %rbx
+; AVX2-NEXT: movq %r8, %r14
+; AVX2-NEXT: movq %rcx, %r11
+; AVX2-NEXT: movq %rdx, %r10
+; AVX2-NEXT: movq %rsi, %r9
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: tzcntq %rdi, %rax
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: tzcntq %r9, %r15
+; AVX2-NEXT: addl $64, %r15d
+; AVX2-NEXT: testq %rdi, %rdi
+; AVX2-NEXT: cmovnel %eax, %r15d
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: tzcntq %r10, %r12
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %r11, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: movq %rdi, %r12
+; AVX2-NEXT: orq %r9, %r12
+; AVX2-NEXT: cmovnel %r15d, %eax
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: tzcntq %r14, %r15
+; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: tzcntq %rbx, %r12
+; AVX2-NEXT: addl $64, %r12d
+; AVX2-NEXT: testq %r14, %r14
+; AVX2-NEXT: cmovnel %r15d, %r12d
+; AVX2-NEXT: xorl %r13d, %r13d
+; AVX2-NEXT: tzcntq %rcx, %r13
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: tzcntq %rdx, %r15
+; AVX2-NEXT: addl $64, %r15d
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %r13d, %r15d
+; AVX2-NEXT: subl $-128, %r15d
+; AVX2-NEXT: movq %r14, %r13
+; AVX2-NEXT: orq %rbx, %r13
+; AVX2-NEXT: cmovnel %r12d, %r15d
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT: addl $256, %r15d # imm = 0x100
+; AVX2-NEXT: movq %r9, %r13
+; AVX2-NEXT: orq %r11, %r13
+; AVX2-NEXT: movq %rdi, %rbp
+; AVX2-NEXT: orq %r10, %rbp
+; AVX2-NEXT: orq %r13, %rbp
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13
+; AVX2-NEXT: cmovnel %eax, %r15d
+; AVX2-NEXT: xorl %ebp, %ebp
+; AVX2-NEXT: tzcntq %r12, %rbp
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %r13, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %r12, %r12
+; AVX2-NEXT: cmovnel %ebp, %eax
+; AVX2-NEXT: xorl %ebp, %ebp
+; AVX2-NEXT: tzcntq %r8, %rbp
+; AVX2-NEXT: addl $64, %ebp
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: tzcntq %rsi, %rcx
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %ecx, %ebp
+; AVX2-NEXT: subl $-128, %ebp
+; AVX2-NEXT: movq %r12, %rcx
+; AVX2-NEXT: orq %r13, %rcx
+; AVX2-NEXT: cmovnel %eax, %ebp
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: tzcntq %rbx, %rcx
+; AVX2-NEXT: addl $64, %ecx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %rdx, %rax
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %eax, %ecx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT: tzcntq %r8, %rsi
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %rbx, %rdx
+; AVX2-NEXT: cmovnel %ecx, %eax
+; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13
+; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: orq %r13, %r12
+; AVX2-NEXT: cmovnel %ebp, %eax
+; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX2-NEXT: orq %r11, %r9
+; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: orq %r14, %rdi
+; AVX2-NEXT: orq %r10, %rdi
+; AVX2-NEXT: addl $512, %eax # imm = 0x200
+; AVX2-NEXT: orq %r9, %rdi
+; AVX2-NEXT: cmovnel %r15d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: test_cttz_undef_i1024:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vmovq %rdi, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vmovq %r9, %xmm1
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT: vpandnq %zmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vplzcntq %zmm3, %zmm3
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %r10d
+; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm0
+; AVX512F-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpsubq %zmm0, %zmm4, %zmm0
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: addl $512, %eax # imm = 0x200
+; AVX512F-NEXT: orq %r9, %rsi
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT: orq %rsi, %rcx
+; AVX512F-NEXT: orq %r8, %rdi
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rdx
+; AVX512F-NEXT: orq %rdi, %rdx
+; AVX512F-NEXT: orq %rcx, %rdx
+; AVX512F-NEXT: cmovnel %r10d, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: test_cttz_undef_i1024:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovq %rcx, %xmm0
+; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT: vmovq %rsi, %xmm1
+; AVX512VL-NEXT: vmovq %rdi, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vmovq %r9, %xmm1
+; AVX512VL-NEXT: vmovq %r8, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT: vpandnq %zmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vplzcntq %zmm3, %zmm3
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vpsubq %zmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %r10d
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm0
+; AVX512VL-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512VL-NEXT: vpsubq %zmm0, %zmm4, %zmm0
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: addl $512, %eax # imm = 0x200
+; AVX512VL-NEXT: orq %r9, %rsi
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rcx
+; AVX512VL-NEXT: orq %rsi, %rcx
+; AVX512VL-NEXT: orq %r8, %rdi
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rdx
+; AVX512VL-NEXT: orq %rdi, %rdx
+; AVX512VL-NEXT: orq %rcx, %rdx
+; AVX512VL-NEXT: cmovnel %r10d, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: test_cttz_undef_i1024:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovq %rcx, %xmm0
+; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1
+; AVX512POPCNT-NEXT: vmovq %rdi, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512POPCNT-NEXT: vmovq %r9, %xmm1
+; AVX512POPCNT-NEXT: vmovq %r8, %xmm2
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1
+; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm1
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm0, %zmm3
+; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %r10d
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm0
+; AVX512POPCNT-NEXT: vpandnq %zmm0, %zmm1, %zmm0
+; AVX512POPCNT-NEXT: vpopcntq %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200
+; AVX512POPCNT-NEXT: orq %r9, %rsi
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rcx
+; AVX512POPCNT-NEXT: orq %rsi, %rcx
+; AVX512POPCNT-NEXT: orq %r8, %rdi
+; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rdx
+; AVX512POPCNT-NEXT: orq %rdi, %rdx
+; AVX512POPCNT-NEXT: orq %rcx, %rdx
+; AVX512POPCNT-NEXT: cmovnel %r10d, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 -1)
+ %res = trunc i1024 %cnt to i32
+ ret i32 %res
+}
+
+define i32 @load_cttz_undef_i1024(ptr %p0) nounwind {
+; SSE-LABEL: load_cttz_undef_i1024:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq 72(%rdi), %rbx
+; SSE-NEXT: movq 56(%rdi), %r9
+; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %rcx
+; SSE-NEXT: movq 40(%rdi), %r10
+; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 32(%rdi), %rsi
+; SSE-NEXT: movq 24(%rdi), %rbp
+; SSE-NEXT: movq (%rdi), %r8
+; SSE-NEXT: movq 8(%rdi), %r11
+; SSE-NEXT: rep bsfq %r8, %rax
+; SSE-NEXT: rep bsfq %r11, %rdx
+; SSE-NEXT: addl $64, %edx
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %eax, %edx
+; SSE-NEXT: movq 16(%rdi), %r14
+; SSE-NEXT: rep bsfq %r14, %r15
+; SSE-NEXT: rep bsfq %rbp, %rax
+; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %r14, %r14
+; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: movq %r8, %r15
+; SSE-NEXT: orq %r11, %r15
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: rep bsfq %rsi, %rdx
+; SSE-NEXT: rep bsfq %r10, %r13
+; SSE-NEXT: addl $64, %r13d
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: cmovnel %edx, %r13d
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: rep bsfq %rcx, %rdx
+; SSE-NEXT: rep bsfq %r9, %r15
+; SSE-NEXT: addl $64, %r15d
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %edx, %r15d
+; SSE-NEXT: movq 64(%rdi), %r12
+; SSE-NEXT: subl $-128, %r15d
+; SSE-NEXT: movq %rsi, %rdx
+; SSE-NEXT: orq %r10, %rdx
+; SSE-NEXT: cmovnel %r13d, %r15d
+; SSE-NEXT: addl $256, %r15d # imm = 0x100
+; SSE-NEXT: movq %r11, %rdx
+; SSE-NEXT: orq %rbp, %rdx
+; SSE-NEXT: movq %r8, %r13
+; SSE-NEXT: orq %r14, %r13
+; SSE-NEXT: orq %rdx, %r13
+; SSE-NEXT: cmovnel %eax, %r15d
+; SSE-NEXT: rep bsfq %r12, %rdx
+; SSE-NEXT: rep bsfq %rbx, %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %r12, %r12
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: movq 88(%rdi), %rbp
+; SSE-NEXT: rep bsfq %rbp, %r13
+; SSE-NEXT: addl $64, %r13d
+; SSE-NEXT: movq 80(%rdi), %r10
+; SSE-NEXT: rep bsfq %r10, %rcx
+; SSE-NEXT: testq %r10, %r10
+; SSE-NEXT: cmovnel %ecx, %r13d
+; SSE-NEXT: subl $-128, %r13d
+; SSE-NEXT: movq %r12, %rcx
+; SSE-NEXT: orq %rbx, %rcx
+; SSE-NEXT: cmovnel %eax, %r13d
+; SSE-NEXT: movq 104(%rdi), %r9
+; SSE-NEXT: rep bsfq %r9, %rcx
+; SSE-NEXT: addl $64, %ecx
+; SSE-NEXT: movq 96(%rdi), %rdx
+; SSE-NEXT: rep bsfq %rdx, %rax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %eax, %ecx
+; SSE-NEXT: rep bsfq 120(%rdi), %rax
+; SSE-NEXT: movq 112(%rdi), %rdi
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: rep bsfq %rdi, %rsi
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %r9, %rdx
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: orq %rbp, %rbx
+; SSE-NEXT: orq %r10, %r12
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: orq %rbx, %r12
+; SSE-NEXT: cmovnel %r13d, %eax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE-NEXT: orq %rcx, %r11
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; SSE-NEXT: orq %r14, %r8
+; SSE-NEXT: addl $512, %eax # imm = 0x200
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: load_cttz_undef_i1024:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq 72(%rdi), %r14
+; AVX2-NEXT: movq 64(%rdi), %r15
+; AVX2-NEXT: movq 56(%rdi), %r9
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rdi), %rcx
+; AVX2-NEXT: movq 40(%rdi), %r10
+; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 32(%rdi), %rsi
+; AVX2-NEXT: movq 24(%rdi), %rbp
+; AVX2-NEXT: movq 16(%rdi), %rbx
+; AVX2-NEXT: movq (%rdi), %r8
+; AVX2-NEXT: movq 8(%rdi), %r11
+; AVX2-NEXT: tzcntq %r8, %rax
+; AVX2-NEXT: tzcntq %r11, %rdx
+; AVX2-NEXT: addl $64, %edx
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %eax, %edx
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: tzcntq %rbx, %r12
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %rbp, %rax
+; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rbx, %rbx
+; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: movq %r8, %r12
+; AVX2-NEXT: orq %r11, %r12
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: tzcntq %rsi, %rdx
+; AVX2-NEXT: xorl %r13d, %r13d
+; AVX2-NEXT: tzcntq %r10, %r13
+; AVX2-NEXT: addl $64, %r13d
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: cmovnel %edx, %r13d
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: tzcntq %rcx, %rdx
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: tzcntq %r9, %r12
+; AVX2-NEXT: addl $64, %r12d
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %r12d
+; AVX2-NEXT: subl $-128, %r12d
+; AVX2-NEXT: movq %rsi, %rdx
+; AVX2-NEXT: orq %r10, %rdx
+; AVX2-NEXT: cmovnel %r13d, %r12d
+; AVX2-NEXT: addl $256, %r12d # imm = 0x100
+; AVX2-NEXT: movq %r11, %rdx
+; AVX2-NEXT: orq %rbp, %rdx
+; AVX2-NEXT: movq %r8, %r13
+; AVX2-NEXT: orq %rbx, %r13
+; AVX2-NEXT: orq %rdx, %r13
+; AVX2-NEXT: cmovnel %eax, %r12d
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: tzcntq %r15, %rdx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %r14, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %r15, %r15
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: movq 88(%rdi), %rbp
+; AVX2-NEXT: xorl %r13d, %r13d
+; AVX2-NEXT: tzcntq %rbp, %r13
+; AVX2-NEXT: addl $64, %r13d
+; AVX2-NEXT: movq 80(%rdi), %r10
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: tzcntq %r10, %rcx
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %ecx, %r13d
+; AVX2-NEXT: subl $-128, %r13d
+; AVX2-NEXT: movq %r15, %rcx
+; AVX2-NEXT: orq %r14, %rcx
+; AVX2-NEXT: cmovnel %eax, %r13d
+; AVX2-NEXT: movq 104(%rdi), %r9
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: tzcntq %r9, %rcx
+; AVX2-NEXT: addl $64, %ecx
+; AVX2-NEXT: movq 96(%rdi), %rdx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %rdx, %rax
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %eax, %ecx
+; AVX2-NEXT: movq 112(%rdi), %rsi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq 120(%rdi), %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: tzcntq %rsi, %rdi
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %r9, %rdx
+; AVX2-NEXT: cmovnel %ecx, %eax
+; AVX2-NEXT: orq %rbp, %r14
+; AVX2-NEXT: orq %r10, %r15
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: orq %r14, %r15
+; AVX2-NEXT: cmovnel %r13d, %eax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; AVX2-NEXT: orq %rcx, %r11
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT: orq %rbx, %r8
+; AVX2-NEXT: addl $512, %eax # imm = 0x200
+; AVX2-NEXT: orq %r11, %r8
+; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: load_cttz_undef_i1024:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1
+; AVX512F-NEXT: movq 16(%rdi), %rax
+; AVX512F-NEXT: movq (%rdi), %rcx
+; AVX512F-NEXT: movq 8(%rdi), %rdx
+; AVX512F-NEXT: movq 24(%rdi), %rsi
+; AVX512F-NEXT: orq 56(%rdi), %rsi
+; AVX512F-NEXT: orq 40(%rdi), %rdx
+; AVX512F-NEXT: orq 48(%rdi), %rax
+; AVX512F-NEXT: orq %rsi, %rdx
+; AVX512F-NEXT: orq 32(%rdi), %rcx
+; AVX512F-NEXT: orq %rax, %rcx
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm3
+; AVX512F-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT: vplzcntq %zmm3, %zmm3
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm1, %esi
+; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vpsubq %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: addl $512, %eax # imm = 0x200
+; AVX512F-NEXT: orq %rdx, %rcx
+; AVX512F-NEXT: cmovnel %esi, %eax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: load_cttz_undef_i1024:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm1
+; AVX512VL-NEXT: movq 16(%rdi), %rax
+; AVX512VL-NEXT: movq (%rdi), %rcx
+; AVX512VL-NEXT: movq 8(%rdi), %rdx
+; AVX512VL-NEXT: movq 24(%rdi), %rsi
+; AVX512VL-NEXT: orq 56(%rdi), %rsi
+; AVX512VL-NEXT: orq 40(%rdi), %rdx
+; AVX512VL-NEXT: orq 48(%rdi), %rax
+; AVX512VL-NEXT: orq 32(%rdi), %rcx
+; AVX512VL-NEXT: orq %rsi, %rdx
+; AVX512VL-NEXT: orq %rax, %rcx
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm3
+; AVX512VL-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512VL-NEXT: vplzcntq %zmm3, %zmm3
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT: vpsubq %zmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm1, %esi
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm1
+; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT: vpsubq %zmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: addl $512, %eax # imm = 0x200
+; AVX512VL-NEXT: orq %rdx, %rcx
+; AVX512VL-NEXT: cmovnel %esi, %eax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: load_cttz_undef_i1024:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm1
+; AVX512POPCNT-NEXT: movq 16(%rdi), %rax
+; AVX512POPCNT-NEXT: movq (%rdi), %rcx
+; AVX512POPCNT-NEXT: movq 8(%rdi), %rdx
+; AVX512POPCNT-NEXT: movq 24(%rdi), %rsi
+; AVX512POPCNT-NEXT: orq 56(%rdi), %rsi
+; AVX512POPCNT-NEXT: orq 40(%rdi), %rdx
+; AVX512POPCNT-NEXT: orq 48(%rdi), %rax
+; AVX512POPCNT-NEXT: orq 32(%rdi), %rcx
+; AVX512POPCNT-NEXT: orq %rsi, %rdx
+; AVX512POPCNT-NEXT: orq %rax, %rcx
+; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm3
+; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %esi
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200
+; AVX512POPCNT-NEXT: orq %rdx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %esi, %eax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
%a0 = load i1024, ptr %p0
- %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 0)
+ %cnt = call i1024 @llvm.cttz.i1024(i1024 %a0, i1 -1)
%res = trunc i1024 %cnt to i32
ret i32 %res
}
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 19d751d1..023fb506 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE,SSE4
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
@@ -203,24 +203,14 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB5_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: .LBB5_2:
-; X86-NEXT: andl 4(%eax), %esi
-; X86-NEXT: andl (%eax), %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: setne %al
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $32, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: test_ne_i64:
@@ -242,38 +232,20 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB6_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB6_2:
-; X86-NEXT: movl (%edx), %ecx
-; X86-NEXT: movl 4(%edx), %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: andl %esi, %ebx
-; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: andl %eax, %ebp
-; X86-NEXT: xorl %esi, %edi
-; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: orl %ebx, %ebp
-; X86-NEXT: setne %al
-; X86-NEXT: movl %ecx, (%edx)
-; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btcl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i64:
@@ -300,40 +272,20 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %esi
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB7_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: .LBB7_2:
-; X86-NEXT: movl (%edx), %eax
-; X86-NEXT: movl 4(%edx), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: notl %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: andl %esi, %ebp
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %ecx, %edi
-; X86-NEXT: andl %eax, %esi
-; X86-NEXT: orl %ebx, %ebp
-; X86-NEXT: sete %al
-; X86-NEXT: movl %esi, (%edx)
-; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i64:
@@ -361,38 +313,20 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB8_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB8_2:
-; X86-NEXT: movl (%edx), %ecx
-; X86-NEXT: movl 4(%edx), %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: andl %esi, %ebx
-; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: andl %eax, %ebp
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: orl %ebx, %ebp
-; X86-NEXT: setne %al
-; X86-NEXT: movl %ecx, (%edx)
-; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btsl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i64:
@@ -419,52 +353,26 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB9_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl $0, %eax
-; X86-NEXT: .LBB9_2:
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: notl %ebp
-; X86-NEXT: je .LBB9_4
-; X86-NEXT: # %bb.3:
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: .LBB9_4:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 4(%ecx), %ecx
-; X86-NEXT: andl %ecx, %edx
-; X86-NEXT: andl %ecx, %ebx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl (%edi), %ecx
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: andl %ecx, %ebp
-; X86-NEXT: orl %esi, %ebp
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %ebp, (%edi)
-; X86-NEXT: movl %ebx, 4(%edi)
-; X86-NEXT: sete %al
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i64:
@@ -516,101 +424,25 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $48, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, (%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %esi
-; X86-NEXT: movl 24(%esp,%esi), %edi
-; X86-NEXT: movl 28(%esp,%esi), %eax
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl 16(%esp,%esi), %edx
-; X86-NEXT: movl 20(%esp,%esi), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: andl 8(%ebx), %edi
-; X86-NEXT: andl (%ebx), %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: andl 12(%ebx), %eax
-; X86-NEXT: andl 4(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $96, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
-; SSE-LABEL: test_ne_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: cmovneq %rsi, %rax
-; SSE-NEXT: andq 8(%rdi), %rdx
-; SSE-NEXT: andq (%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: setne %al
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: test_ne_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movl $1, %edx
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %rdx, %rsi
-; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rdx, %rsi
-; AVX2-NEXT: cmovneq %rax, %rdx
-; AVX2-NEXT: andq 8(%rdi), %rsi
-; AVX2-NEXT: andq (%rdi), %rdx
-; AVX2-NEXT: orq %rsi, %rdx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_ne_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: xorl %edx, %edx
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shlxq %rcx, %rax, %rax
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rax, %rdx
-; AVX512-NEXT: cmovneq %rsi, %rax
-; AVX512-NEXT: andq 8(%rdi), %rdx
-; AVX512-NEXT: andq (%rdi), %rax
-; AVX512-NEXT: orq %rdx, %rax
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: retq
+; X64-LABEL: test_ne_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andl $96, %eax
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: movl (%rdi,%rax), %eax
+; X64-NEXT: btl %esi, %eax
+; X64-NEXT: setb %al
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -623,124 +455,33 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 56(%esp,%eax), %esi
-; X86-NEXT: movl 60(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esp,%eax), %edi
-; X86-NEXT: movl 52(%esp,%eax), %ebx
-; X86-NEXT: shldl %cl, %ebx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl 8(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: andl %edi, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 12(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %esi, 12(%eax)
-; X86-NEXT: movl %edi, (%eax)
-; X86-NEXT: movl %ebx, 4(%eax)
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btcl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: complement_ne_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %edx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rdx, %rsi
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r8
-; SSE-NEXT: andq %rsi, %r8
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: andq %rdx, %r9
-; SSE-NEXT: xorq %rcx, %rsi
-; SSE-NEXT: xorq %rax, %rdx
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: setne %al
-; SSE-NEXT: movq %rdx, (%rdi)
-; SSE-NEXT: movq %rsi, 8(%rdi)
-; SSE-NEXT: retq
-;
-; AVX-LABEL: complement_ne_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movl $1, %edx
-; AVX-NEXT: xorl %esi, %esi
-; AVX-NEXT: shldq %cl, %rdx, %rsi
-; AVX-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX-NEXT: testb $64, %cl
-; AVX-NEXT: cmovneq %rdx, %rsi
-; AVX-NEXT: cmovneq %rax, %rdx
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq 8(%rdi), %rcx
-; AVX-NEXT: movq %rcx, %r8
-; AVX-NEXT: andq %rsi, %r8
-; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: andq %rdx, %r9
-; AVX-NEXT: xorq %rcx, %rsi
-; AVX-NEXT: xorq %rax, %rdx
-; AVX-NEXT: orq %r8, %r9
-; AVX-NEXT: setne %al
-; AVX-NEXT: movq %rdx, (%rdi)
-; AVX-NEXT: movq %rsi, 8(%rdi)
-; AVX-NEXT: retq
+; X64-LABEL: complement_ne_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btcl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -755,124 +496,33 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 56(%esp,%eax), %edx
-; X86-NEXT: movl 60(%esp,%eax), %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esp,%eax), %esi
-; X86-NEXT: movl 52(%esp,%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %edx
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl 8(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl (%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: movl 4(%ebx), %ebx
-; X86-NEXT: andl %ebx, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: notl %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl %ebx, %ecx
-; X86-NEXT: notl %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl 8(%ebp), %edi
-; X86-NEXT: movl %edx, 8(%edi)
-; X86-NEXT: movl %eax, 12(%edi)
-; X86-NEXT: movl %esi, (%edi)
-; X86-NEXT: movl %ecx, 4(%edi)
-; X86-NEXT: sete %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: reset_eq_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %edx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rdx, %rsi
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r8
-; SSE-NEXT: andq %rsi, %r8
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: andq %rdx, %r9
-; SSE-NEXT: notq %rdx
-; SSE-NEXT: andq %rcx, %rsi
-; SSE-NEXT: andq %rax, %rdx
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: sete %al
-; SSE-NEXT: movq %rdx, (%rdi)
-; SSE-NEXT: movq %rsi, 8(%rdi)
-; SSE-NEXT: retq
-;
-; AVX-LABEL: reset_eq_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movl $1, %edx
-; AVX-NEXT: xorl %esi, %esi
-; AVX-NEXT: shldq %cl, %rdx, %rsi
-; AVX-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX-NEXT: testb $64, %cl
-; AVX-NEXT: cmovneq %rdx, %rsi
-; AVX-NEXT: cmovneq %rax, %rdx
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq 8(%rdi), %rcx
-; AVX-NEXT: andnq %rcx, %rsi, %r8
-; AVX-NEXT: andq %rsi, %rcx
-; AVX-NEXT: andnq %rax, %rdx, %rsi
-; AVX-NEXT: andq %rdx, %rax
-; AVX-NEXT: orq %rcx, %rax
-; AVX-NEXT: sete %al
-; AVX-NEXT: movq %rsi, (%rdi)
-; AVX-NEXT: movq %r8, 8(%rdi)
-; AVX-NEXT: retq
+; X64-LABEL: reset_eq_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setae %al
+; X64-NEXT: btrl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -888,124 +538,33 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 56(%esp,%eax), %esi
-; X86-NEXT: movl 60(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esp,%eax), %edi
-; X86-NEXT: movl 52(%esp,%eax), %ebx
-; X86-NEXT: shldl %cl, %ebx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl 8(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: andl %edi, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 12(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %esi, 12(%eax)
-; X86-NEXT: movl %edi, (%eax)
-; X86-NEXT: movl %ebx, 4(%eax)
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btsl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: set_ne_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %edx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rdx, %rsi
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r8
-; SSE-NEXT: andq %rsi, %r8
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: andq %rdx, %r9
-; SSE-NEXT: orq %rcx, %rsi
-; SSE-NEXT: orq %rax, %rdx
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: setne %al
-; SSE-NEXT: movq %rdx, (%rdi)
-; SSE-NEXT: movq %rsi, 8(%rdi)
-; SSE-NEXT: retq
-;
-; AVX-LABEL: set_ne_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movl $1, %edx
-; AVX-NEXT: xorl %esi, %esi
-; AVX-NEXT: shldq %cl, %rdx, %rsi
-; AVX-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX-NEXT: testb $64, %cl
-; AVX-NEXT: cmovneq %rdx, %rsi
-; AVX-NEXT: cmovneq %rax, %rdx
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq 8(%rdi), %rcx
-; AVX-NEXT: movq %rcx, %r8
-; AVX-NEXT: andq %rsi, %r8
-; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: andq %rdx, %r9
-; AVX-NEXT: orq %rcx, %rsi
-; AVX-NEXT: orq %rax, %rdx
-; AVX-NEXT: orq %r8, %r9
-; AVX-NEXT: setne %al
-; AVX-NEXT: movq %rdx, (%rdi)
-; AVX-NEXT: movq %rsi, 8(%rdi)
-; AVX-NEXT: retq
+; X64-LABEL: set_ne_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btsl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -1020,218 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movzbl 16(%ebp), %eax
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shrb $3, %dl
-; X86-NEXT: andb $12, %dl
-; X86-NEXT: negb %dl
-; X86-NEXT: movsbl %dl, %esi
-; X86-NEXT: movl 64(%esp,%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 68(%esp,%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%esp,%esi), %ebx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 76(%esp,%esi), %edi
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %edi, %esi
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl 12(%ecx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl 4(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 100(%esp,%ecx), %edi
-; X86-NEXT: movl 104(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: notl %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl 108(%esp,%ebx), %ebx
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: notl %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl 96(%esp,%ebx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 8(%ecx)
-; X86-NEXT: movl %esi, 12(%ecx)
-; X86-NEXT: movl %eax, (%ecx)
-; X86-NEXT: movl %edx, 4(%ecx)
-; X86-NEXT: sete %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %esi
-; SSE-NEXT: xorl %r8d, %r8d
-; SSE-NEXT: shldq %cl, %rsi, %r8
-; SSE-NEXT: shlq %cl, %rsi
-; SSE-NEXT: movl %edx, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: xorl %r9d, %r9d
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rsi, %r8
-; SSE-NEXT: cmovneq %r9, %rsi
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: cmovneq %r9, %rax
-; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq 8(%rdi), %r9
-; SSE-NEXT: movq %r9, %r10
-; SSE-NEXT: andq %r8, %r10
-; SSE-NEXT: notq %r8
-; SSE-NEXT: movq %rcx, %r11
-; SSE-NEXT: andq %rsi, %r11
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: andq %r9, %r8
-; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq %rcx, %rsi
-; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: orq %r10, %r11
-; SSE-NEXT: sete %al
-; SSE-NEXT: movq %rsi, (%rdi)
-; SSE-NEXT: movq %r8, 8(%rdi)
+; SSE-NEXT: andl $96, %esi
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: movl (%rdi,%rsi), %r8d
+; SSE-NEXT: btl %ecx, %r8d
+; SSE-NEXT: setae %al
+; SSE-NEXT: shll %cl, %edx
+; SSE-NEXT: btrl %ecx, %r8d
+; SSE-NEXT: orl %r8d, %edx
+; SSE-NEXT: movl %edx, (%rdi,%rsi)
; SSE-NEXT: retq
;
-; AVX2-LABEL: init_eq_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl $1, %esi
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: shldq %cl, %rsi, %rax
-; AVX2-NEXT: xorl %r8d, %r8d
-; AVX2-NEXT: movl %edx, %edx
-; AVX2-NEXT: xorl %r9d, %r9d
-; AVX2-NEXT: shldq %cl, %rdx, %r9
-; AVX2-NEXT: shlxq %rcx, %rsi, %rsi
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rsi, %rax
-; AVX2-NEXT: cmovneq %r8, %rsi
-; AVX2-NEXT: shlxq %rcx, %rdx, %rcx
-; AVX2-NEXT: cmovneq %rcx, %r9
-; AVX2-NEXT: cmovneq %r8, %rcx
-; AVX2-NEXT: movq (%rdi), %rdx
-; AVX2-NEXT: movq 8(%rdi), %r8
-; AVX2-NEXT: andnq %r8, %rax, %r10
-; AVX2-NEXT: andq %rax, %r8
-; AVX2-NEXT: andnq %rdx, %rsi, %r11
-; AVX2-NEXT: andq %rsi, %rdx
-; AVX2-NEXT: orq %r9, %r10
-; AVX2-NEXT: orq %rcx, %r11
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: movq %r11, (%rdi)
-; AVX2-NEXT: movq %r10, 8(%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: init_eq_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movl $1, %esi
-; AVX512-NEXT: xorl %r8d, %r8d
-; AVX512-NEXT: shldq %cl, %rsi, %r8
-; AVX512-NEXT: shlxq %rcx, %rsi, %rsi
-; AVX512-NEXT: movl %edx, %edx
-; AVX512-NEXT: xorl %r9d, %r9d
-; AVX512-NEXT: shldq %cl, %rdx, %r9
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rsi, %r8
-; AVX512-NEXT: cmovneq %rax, %rsi
-; AVX512-NEXT: shlxq %rcx, %rdx, %rcx
-; AVX512-NEXT: cmovneq %rcx, %r9
-; AVX512-NEXT: cmovneq %rax, %rcx
-; AVX512-NEXT: movq (%rdi), %rax
-; AVX512-NEXT: movq 8(%rdi), %rdx
-; AVX512-NEXT: andnq %rdx, %r8, %r10
-; AVX512-NEXT: andq %r8, %rdx
-; AVX512-NEXT: andnq %rax, %rsi, %r8
-; AVX512-NEXT: andq %rsi, %rax
-; AVX512-NEXT: orq %r9, %r10
-; AVX512-NEXT: orq %rcx, %r8
-; AVX512-NEXT: orq %rdx, %rax
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: movq %r8, (%rdi)
-; AVX512-NEXT: movq %r10, 8(%rdi)
-; AVX512-NEXT: retq
+; AVX-LABEL: init_eq_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: andl $96, %ecx
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: movl (%rdi,%rcx), %r8d
+; AVX-NEXT: btl %esi, %r8d
+; AVX-NEXT: setae %al
+; AVX-NEXT: btrl %esi, %r8d
+; AVX-NEXT: shlxl %esi, %edx, %edx
+; AVX-NEXT: orl %r8d, %edx
+; AVX-NEXT: movl %edx, (%rdi,%rcx)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -1252,935 +648,317 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $60, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
+; X86-NEXT: retl
+;
+; X64-LABEL: test_ne_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: andl $60, %eax
+; X64-NEXT: movl (%rdi,%rax), %eax
+; X64-NEXT: btl %esi, %eax
+; X64-NEXT: setb %al
+; X64-NEXT: retq
+ %rem = and i32 %position, 511
+ %ofs = zext nneg i32 %rem to i512
+ %bit = shl nuw i512 1, %ofs
+ %ld = load i512, ptr %word
+ %test = and i512 %ld, %bit
+ %cmp = icmp ne i512 %test, 0
+ ret i1 %cmp
+}
+
+define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
+; X86-LABEL: complement_ne_i512:
+; X86: # %bb.0:
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $224, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edx), %eax
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edx), %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 52(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 4(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl 40(%ebx), %eax
-; X86-NEXT: andl 8(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 56(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 24(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: andl 44(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 12(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 60(%edi), %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 28(%edi), %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%edx), %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: negl %edx
-; X86-NEXT: movl 192(%esp,%edx), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl 32(%ebx), %ecx
-; X86-NEXT: andl (%ebx), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: andl 16(%ebx), %edi
-; X86-NEXT: andl 48(%ebx), %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 36(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 4(%ebx), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 20(%ebx), %ecx
-; X86-NEXT: andl 52(%ebx), %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btcl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: test_ne_i512:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rbx
-; SSE-NEXT: movq -48(%rsp,%rbx), %rdx
-; SSE-NEXT: movq -40(%rsp,%rbx), %r14
-; SSE-NEXT: movq %r14, %rax
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq -16(%rsp,%rbx), %r11
-; SSE-NEXT: movq -8(%rsp,%rbx), %r10
-; SSE-NEXT: shldq %cl, %r11, %r10
-; SSE-NEXT: movq -32(%rsp,%rbx), %r9
-; SSE-NEXT: movq -24(%rsp,%rbx), %r15
-; SSE-NEXT: movq %r15, %r8
-; SSE-NEXT: shldq %cl, %r9, %r8
-; SSE-NEXT: movq -56(%rsp,%rbx), %rsi
-; SSE-NEXT: shldq %cl, %rsi, %rdx
-; SSE-NEXT: shldq %cl, %r15, %r11
-; SSE-NEXT: shldq %cl, %r14, %r9
-; SSE-NEXT: movq -64(%rsp,%rbx), %rbx
-; SSE-NEXT: shldq %cl, %rbx, %rsi
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rbx
-; SSE-NEXT: andq 32(%rdi), %r9
-; SSE-NEXT: andq 48(%rdi), %r11
-; SSE-NEXT: andq 16(%rdi), %rdx
-; SSE-NEXT: orq %r11, %rdx
-; SSE-NEXT: andq 40(%rdi), %r8
-; SSE-NEXT: andq 56(%rdi), %r10
-; SSE-NEXT: andq 24(%rdi), %rax
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: andq (%rdi), %rbx
-; SSE-NEXT: orq %r9, %rbx
-; SSE-NEXT: orq %rdx, %rbx
-; SSE-NEXT: andq 8(%rdi), %rsi
-; SSE-NEXT: orq %r8, %rsi
-; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: orq %rbx, %rsi
-; SSE-NEXT: setne %al
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: retq
+; X64-LABEL: complement_ne_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: andl $60, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btcl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
+ %rem = and i32 %position, 511
+ %ofs = zext nneg i32 %rem to i512
+ %bit = shl nuw i512 1, %ofs
+ %ld = load i512, ptr %word
+ %test = and i512 %ld, %bit
+ %res = xor i512 %ld, %bit
+ %cmp = icmp ne i512 %test, 0
+ store i512 %res, ptr %word
+ ret i1 %cmp
+}
+
+define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
+; X86-LABEL: reset_eq_i512:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
;
-; AVX2-LABEL: test_ne_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rsi
-; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx
-; AVX2-NEXT: movq %rbx, %rax
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq -16(%rsp,%rsi), %r11
-; AVX2-NEXT: movq -8(%rsp,%rsi), %r10
-; AVX2-NEXT: shldq %cl, %r11, %r10
-; AVX2-NEXT: movq -32(%rsp,%rsi), %r9
-; AVX2-NEXT: movq -24(%rsp,%rsi), %r14
-; AVX2-NEXT: movq %r14, %r8
-; AVX2-NEXT: shldq %cl, %r9, %r8
-; AVX2-NEXT: movq -64(%rsp,%rsi), %r15
-; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi
-; AVX2-NEXT: shldq %cl, %rsi, %rdx
-; AVX2-NEXT: shldq %cl, %r14, %r11
-; AVX2-NEXT: shldq %cl, %rbx, %r9
-; AVX2-NEXT: shldq %cl, %r15, %rsi
-; AVX2-NEXT: shlxq %rcx, %r15, %rcx
-; AVX2-NEXT: andq 32(%rdi), %r9
-; AVX2-NEXT: andq 48(%rdi), %r11
-; AVX2-NEXT: andq 16(%rdi), %rdx
-; AVX2-NEXT: andq 40(%rdi), %r8
-; AVX2-NEXT: andq 56(%rdi), %r10
-; AVX2-NEXT: andq 24(%rdi), %rax
-; AVX2-NEXT: orq %r11, %rdx
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: andq (%rdi), %rcx
-; AVX2-NEXT: orq %r9, %rcx
-; AVX2-NEXT: orq %rdx, %rcx
-; AVX2-NEXT: andq 8(%rdi), %rsi
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: orq %rax, %rsi
-; AVX2-NEXT: orq %rcx, %rsi
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; X64-LABEL: reset_eq_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: andl $60, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setae %al
+; X64-NEXT: btrl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
+ %rem = and i32 %position, 511
+ %ofs = zext nneg i32 %rem to i512
+ %bit = shl nuw i512 1, %ofs
+ %mask = xor i512 %bit, -1
+ %ld = load i512, ptr %word
+ %test = and i512 %ld, %bit
+ %res = and i512 %ld, %mask
+ %cmp = icmp eq i512 %test, 0
+ store i512 %res, ptr %word
+ ret i1 %cmp
+}
+
+define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
+; X86-LABEL: set_ne_i512:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btsl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
;
-; AVX512-LABEL: test_ne_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx
-; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %rax
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq -16(%rsp,%rbx), %r11
-; AVX512-NEXT: movq -8(%rsp,%rbx), %r10
-; AVX512-NEXT: shldq %cl, %r11, %r10
-; AVX512-NEXT: movq -32(%rsp,%rbx), %r9
-; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
-; AVX512-NEXT: movq %r15, %r8
-; AVX512-NEXT: shldq %cl, %r9, %r8
-; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi
-; AVX512-NEXT: shldq %cl, %rsi, %rdx
-; AVX512-NEXT: shldq %cl, %r15, %r11
-; AVX512-NEXT: shldq %cl, %r14, %r9
-; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
-; AVX512-NEXT: shldq %cl, %rbx, %rsi
-; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
-; AVX512-NEXT: andq 32(%rdi), %r9
-; AVX512-NEXT: andq 48(%rdi), %r11
-; AVX512-NEXT: andq 16(%rdi), %rdx
-; AVX512-NEXT: andq 40(%rdi), %r8
-; AVX512-NEXT: andq 56(%rdi), %r10
-; AVX512-NEXT: andq 24(%rdi), %rax
-; AVX512-NEXT: orq %r11, %rdx
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: andq (%rdi), %rcx
-; AVX512-NEXT: orq %r9, %rcx
-; AVX512-NEXT: orq %rdx, %rcx
-; AVX512-NEXT: andq 8(%rdi), %rsi
-; AVX512-NEXT: orq %r8, %rsi
-; AVX512-NEXT: orq %rax, %rsi
-; AVX512-NEXT: orq %rcx, %rsi
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: set_ne_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: andl $60, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btsl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
%ld = load i512, ptr %word
%test = and i512 %ld, %bit
+ %res = or i512 %ld, %bit
%cmp = icmp ne i512 %test, 0
+ store i512 %res, ptr %word
ret i1 %cmp
}
-define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
-; X86-LABEL: complement_ne_i512:
+define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
+; X86-LABEL: init_eq_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $272, %esp # imm = 0x110
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edx), %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edx), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 52(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl 8(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl 56(%edx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: movl 24(%edx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%eax), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 12(%eax), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl 60(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 28(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: negl %eax
-; X86-NEXT: movl 240(%esp,%eax), %esi
-; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl 32(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %eax
-; X86-NEXT: movl (%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 16(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: movl 48(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 36(%esi), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl %esi, %edi
-; X86-NEXT: movl 52(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl %ebx, 60(%edx)
-; X86-NEXT: movl %edi, 56(%edx)
-; X86-NEXT: movl %ecx, 52(%edx)
-; X86-NEXT: movl %esi, 44(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 40(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 36(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 32(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 28(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 24(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 20(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 16(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 12(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 8(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 4(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, (%edx)
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 48(%edx)
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: complement_ne_i512:
+; SSE-LABEL: init_eq_i512:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $56, %rsp
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rbx
-; SSE-NEXT: movq (%rsp,%rbx), %rsi
-; SSE-NEXT: movq 8(%rsp,%rbx), %r14
-; SSE-NEXT: movq %r14, %rax
-; SSE-NEXT: shldq %cl, %rsi, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 32(%rsp,%rbx), %r8
-; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
-; SSE-NEXT: shldq %cl, %r8, %rbp
-; SSE-NEXT: movq 16(%rsp,%rbx), %r9
-; SSE-NEXT: movq 24(%rsp,%rbx), %r15
-; SSE-NEXT: movq %r15, %r10
-; SSE-NEXT: shldq %cl, %r9, %r10
-; SSE-NEXT: movq -8(%rsp,%rbx), %r11
-; SSE-NEXT: shldq %cl, %r11, %rsi
-; SSE-NEXT: shldq %cl, %r15, %r8
-; SSE-NEXT: shldq %cl, %r14, %r9
-; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
-; SSE-NEXT: shldq %cl, %rbx, %r11
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rbx
-; SSE-NEXT: movq 24(%rdi), %r15
-; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 56(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 16(%rdi), %r12
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %r8, %r13
-; SSE-NEXT: andq %rsi, %r12
-; SSE-NEXT: orq %r13, %r12
-; SSE-NEXT: movq %rcx, %r13
-; SSE-NEXT: andq %rbp, %r13
-; SSE-NEXT: andq %rax, %r15
-; SSE-NEXT: orq %r13, %r15
-; SSE-NEXT: movq 32(%rdi), %r14
-; SSE-NEXT: movq %r14, %rcx
-; SSE-NEXT: andq %r9, %rcx
-; SSE-NEXT: movq (%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rbx, %r13
-; SSE-NEXT: orq %rcx, %r13
-; SSE-NEXT: orq %r12, %r13
-; SSE-NEXT: movq 40(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r12
-; SSE-NEXT: andq %r10, %r12
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movq %rdx, %rax
-; SSE-NEXT: andq %r11, %rax
-; SSE-NEXT: orq %r12, %rax
-; SSE-NEXT: orq %r15, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; SSE-NEXT: xorq %rcx, %r10
-; SSE-NEXT: xorq %r14, %r9
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; SSE-NEXT: xorq %rdx, %r11
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: movq %r8, 48(%rdi)
-; SSE-NEXT: movq %rbp, 56(%rdi)
-; SSE-NEXT: movq %r9, 32(%rdi)
-; SSE-NEXT: movq %r10, 40(%rdi)
-; SSE-NEXT: movq %rsi, 16(%rdi)
-; SSE-NEXT: movq %r15, 24(%rdi)
-; SSE-NEXT: movq %rbx, (%rdi)
-; SSE-NEXT: movq %r11, 8(%rdi)
-; SSE-NEXT: setne %al
-; SSE-NEXT: addq $56, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: andl $60, %esi
+; SSE-NEXT: movl (%rdi,%rsi), %r8d
+; SSE-NEXT: btl %ecx, %r8d
+; SSE-NEXT: setae %al
+; SSE-NEXT: shll %cl, %edx
+; SSE-NEXT: btrl %ecx, %r8d
+; SSE-NEXT: orl %r8d, %edx
+; SSE-NEXT: movl %edx, (%rdi,%rsi)
; SSE-NEXT: retq
;
-; AVX2-LABEL: complement_ne_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $72, %rsp
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, (%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rbx
-; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX2-NEXT: movq %rbp, %rax
-; AVX2-NEXT: shldq %cl, %rsi, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX2-NEXT: shldq %cl, %r8, %r13
-; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX2-NEXT: movq %r14, %r10
-; AVX2-NEXT: shldq %cl, %r9, %r10
-; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX2-NEXT: shldq %cl, %r11, %rsi
-; AVX2-NEXT: shldq %cl, %r14, %r8
-; AVX2-NEXT: movq 16(%rdi), %r12
-; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r8, %r14
-; AVX2-NEXT: andq %rsi, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq 56(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r13, %r15
-; AVX2-NEXT: movq 24(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %rax, %r14
-; AVX2-NEXT: orq %r15, %r14
-; AVX2-NEXT: shldq %cl, %rbp, %r9
-; AVX2-NEXT: movq (%rsp,%rbx), %rdx
-; AVX2-NEXT: movq 32(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r9, %r15
-; AVX2-NEXT: shlxq %rcx, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq (%rdi), %rbx
-; AVX2-NEXT: movq %rbx, %rbp
-; AVX2-NEXT: andq %rax, %rbp
-; AVX2-NEXT: orq %r15, %rbp
-; AVX2-NEXT: orq %r12, %rbp
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %rdx, %r11
-; AVX2-NEXT: movq 40(%rdi), %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: andq %r10, %rcx
-; AVX2-NEXT: movq 8(%rdi), %r15
-; AVX2-NEXT: movq %r15, %r12
-; AVX2-NEXT: andq %r11, %r12
-; AVX2-NEXT: orq %rcx, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX2-NEXT: xorq %rax, %r10
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT: xorq %r15, %r11
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX2-NEXT: orq %rbp, %r12
-; AVX2-NEXT: movq %r8, 48(%rdi)
-; AVX2-NEXT: movq %r13, 56(%rdi)
-; AVX2-NEXT: movq %r9, 32(%rdi)
-; AVX2-NEXT: movq %r10, 40(%rdi)
-; AVX2-NEXT: movq %rsi, 16(%rdi)
-; AVX2-NEXT: movq %rcx, 24(%rdi)
-; AVX2-NEXT: movq %rbx, (%rdi)
-; AVX2-NEXT: movq %r11, 8(%rdi)
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: addq $72, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: complement_ne_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $72, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, (%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX512-NEXT: movq %rbp, %rax
-; AVX512-NEXT: shldq %cl, %rsi, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX512-NEXT: shldq %cl, %r8, %r13
-; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %r10
-; AVX512-NEXT: shldq %cl, %r9, %r10
-; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX512-NEXT: shldq %cl, %r11, %rsi
-; AVX512-NEXT: shldq %cl, %r14, %r8
-; AVX512-NEXT: movq 16(%rdi), %r12
-; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r8, %r14
-; AVX512-NEXT: andq %rsi, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq 56(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r13, %r15
-; AVX512-NEXT: movq 24(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %rax, %r14
-; AVX512-NEXT: orq %r15, %r14
-; AVX512-NEXT: shldq %cl, %rbp, %r9
-; AVX512-NEXT: movq (%rsp,%rbx), %rdx
-; AVX512-NEXT: movq 32(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r9, %r15
-; AVX512-NEXT: shlxq %rcx, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq (%rdi), %rbx
-; AVX512-NEXT: movq %rbx, %rbp
-; AVX512-NEXT: andq %rax, %rbp
-; AVX512-NEXT: orq %r15, %rbp
-; AVX512-NEXT: orq %r12, %rbp
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rdx, %r11
-; AVX512-NEXT: movq 40(%rdi), %rax
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andq %r10, %rcx
-; AVX512-NEXT: movq 8(%rdi), %r15
-; AVX512-NEXT: movq %r15, %r12
-; AVX512-NEXT: andq %r11, %r12
-; AVX512-NEXT: orq %rcx, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT: xorq %rax, %r10
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX512-NEXT: xorq %r15, %r11
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX512-NEXT: orq %rbp, %r12
-; AVX512-NEXT: movq %r8, 48(%rdi)
-; AVX512-NEXT: movq %r13, 56(%rdi)
-; AVX512-NEXT: movq %r9, 32(%rdi)
-; AVX512-NEXT: movq %r10, 40(%rdi)
-; AVX512-NEXT: movq %rsi, 16(%rdi)
-; AVX512-NEXT: movq %rcx, 24(%rdi)
-; AVX512-NEXT: movq %rbx, (%rdi)
-; AVX512-NEXT: movq %r11, 8(%rdi)
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: addq $72, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: init_eq_i512:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: andl $60, %ecx
+; AVX-NEXT: movl (%rdi,%rcx), %r8d
+; AVX-NEXT: btl %esi, %r8d
+; AVX-NEXT: setae %al
+; AVX-NEXT: btrl %esi, %r8d
+; AVX-NEXT: shlxl %esi, %edx, %edx
+; AVX-NEXT: orl %r8d, %edx
+; AVX-NEXT: movl %edx, (%rdi,%rcx)
+; AVX-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
+ %mask = xor i512 %bit, -1
+ %val0 = zext i1 %value to i512
+ %val = shl nuw i512 %val0, %ofs
%ld = load i512, ptr %word
%test = and i512 %ld, %bit
- %res = xor i512 %ld, %bit
- %cmp = icmp ne i512 %test, 0
+ %res0 = and i512 %ld, %mask
+ %res = or i512 %res0, %val
+ %cmp = icmp eq i512 %test, 0
store i512 %res, ptr %word
ret i1 %cmp
}
-define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
-; X86-LABEL: reset_eq_i512:
+; i4096
+
+define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
+; X86-LABEL: test_ne_i4096:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $4064, %edx # imm = 0xFE0
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
+; X86-NEXT: retl
+;
+; X64-LABEL: test_ne_i4096:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andl $4064, %eax # imm = 0xFE0
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: movl (%rdi,%rax), %eax
+; X64-NEXT: btl %esi, %eax
+; X64-NEXT: setb %al
+; X64-NEXT: retq
+ %rem = and i32 %position, 4095
+ %ofs = zext nneg i32 %rem to i4096
+ %bit = shl nuw i4096 1, %ofs
+ %ld = load i4096, ptr %word
+ %test = and i4096 %ld, %bit
+ %cmp = icmp ne i4096 %test, 0
+ ret i1 %cmp
+}
+
+; Special Cases
+
+; Multiple uses of the stored value
+define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
+; X86-LABEL: complement_cmpz_i128:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: andl $96, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: xorl %edx, (%eax,%ecx)
+; X86-NEXT: movl (%eax), %ecx
+; X86-NEXT: movl 4(%eax), %edx
+; X86-NEXT: orl 12(%eax), %edx
+; X86-NEXT: orl 8(%eax), %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: setne %al
+; X86-NEXT: retl
+;
+; SSE-LABEL: complement_cmpz_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: shll %cl, %eax
+; SSE-NEXT: andl $96, %ecx
+; SSE-NEXT: shrl $3, %ecx
+; SSE-NEXT: xorl %eax, (%rdi,%rcx)
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: orq 8(%rdi), %rax
+; SSE-NEXT: setne %al
+; SSE-NEXT: retq
+;
+; AVX-LABEL: complement_cmpz_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX-NEXT: movl $1, %eax
+; AVX-NEXT: shlxl %esi, %eax, %eax
+; AVX-NEXT: andl $96, %esi
+; AVX-NEXT: shrl $3, %esi
+; AVX-NEXT: xorl %eax, (%rdi,%rsi)
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: orq 8(%rdi), %rax
+; AVX-NEXT: setne %al
+; AVX-NEXT: retq
+ %rem = and i32 %position, 127
+ %ofs = zext nneg i32 %rem to i128
+ %bit = shl nuw i128 1, %ofs
+ %ld = load i128, ptr %word
+ %res = xor i128 %ld, %bit
+ store i128 %res, ptr %word
+ %cmp = icmp ne i128 %res, 0
+ ret i1 %cmp
+}
+
+; Load hidden behind bitcast
+define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) nounwind {
+; X86-LABEL: complement_ne_i128_bitcast:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
@@ -2188,614 +966,298 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $288, %esp # imm = 0x120
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-NEXT: subl %eax, %edi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 4(%edi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edi), %eax
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: shldl %cl, %edx, %ebx
-; X86-NEXT: movl 12(%edi), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edi), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%edi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edi), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edi), %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl 52(%edi), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl 56(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%esi), %ecx
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movzwl (%eax), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl 44(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esi), %ecx
+; X86-NEXT: movzwl 12(%eax), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edi), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%edi), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: negl %eax
-; X86-NEXT: movl 256(%esp,%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: movl 32(%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: movzwl 14(%eax), %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esi), %ecx
+; X86-NEXT: shll $16, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movzwl 2(%eax), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movzwl 4(%eax), %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movzwl 6(%eax), %esi
+; X86-NEXT: movzwl 8(%eax), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: andl %edi, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: movl 52(%ebx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzwl 10(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shll $16, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: andb $96, %bl
+; X86-NEXT: shrb $3, %bl
+; X86-NEXT: movzbl %bl, %edi
+; X86-NEXT: movl 32(%esp,%edi), %edi
+; X86-NEXT: btcl %eax, %edi
+; X86-NEXT: andl $96, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %edi, (%ecx,%eax)
+; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: notl %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movw %dx, 14(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 60(%eax)
-; X86-NEXT: movl %esi, 56(%eax)
-; X86-NEXT: movl %ecx, 52(%eax)
+; X86-NEXT: movw %dx, 12(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 44(%eax)
+; X86-NEXT: movw %cx, 10(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 40(%eax)
+; X86-NEXT: movw %cx, 8(%eax)
+; X86-NEXT: movw %si, 6(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 36(%eax)
+; X86-NEXT: movw %cx, 4(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 32(%eax)
+; X86-NEXT: movw %cx, 2(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 28(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 24(%eax)
-; X86-NEXT: movl %ebx, 20(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 16(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 48(%eax)
-; X86-NEXT: sete %al
+; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+;
+; SSE2-LABEL: complement_ne_i128_bitcast:
+; SSE2: # %bb.0:
+; SSE2-NEXT: # kill: def $esi killed $esi def $rsi
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movq 8(%rdi), %rax
+; SSE2-NEXT: movq %xmm0, %rdx
+; SSE2-NEXT: movl %esi, %ecx
+; SSE2-NEXT: andb $32, %cl
+; SSE2-NEXT: shrdq %cl, %rax, %rdx
+; SSE2-NEXT: shrq %cl, %rax
+; SSE2-NEXT: testb $64, %sil
+; SSE2-NEXT: cmoveq %rdx, %rax
+; SSE2-NEXT: btcl %esi, %eax
+; SSE2-NEXT: andl $96, %esi
+; SSE2-NEXT: shrl $3, %esi
+; SSE2-NEXT: movl %eax, (%rdi,%rsi)
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: complement_ne_i128_bitcast:
+; SSE4: # %bb.0:
+; SSE4-NEXT: # kill: def $esi killed $esi def $rsi
+; SSE4-NEXT: movdqa (%rdi), %xmm0
+; SSE4-NEXT: pextrq $1, %xmm0, %rax
+; SSE4-NEXT: movq %xmm0, %rdx
+; SSE4-NEXT: movl %esi, %ecx
+; SSE4-NEXT: andb $32, %cl
+; SSE4-NEXT: shrdq %cl, %rax, %rdx
+; SSE4-NEXT: shrq %cl, %rax
+; SSE4-NEXT: testb $64, %sil
+; SSE4-NEXT: cmoveq %rdx, %rax
+; SSE4-NEXT: btcl %esi, %eax
+; SSE4-NEXT: andl $96, %esi
+; SSE4-NEXT: shrl $3, %esi
+; SSE4-NEXT: movl %eax, (%rdi,%rsi)
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: complement_ne_i128_bitcast:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: vmovq %xmm0, %rdx
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: andb $32, %cl
+; AVX-NEXT: shrdq %cl, %rax, %rdx
+; AVX-NEXT: shrxq %rcx, %rax, %rax
+; AVX-NEXT: testb $64, %sil
+; AVX-NEXT: cmoveq %rdx, %rax
+; AVX-NEXT: btcl %esi, %eax
+; AVX-NEXT: andl $96, %esi
+; AVX-NEXT: shrl $3, %esi
+; AVX-NEXT: movl %eax, (%rdi,%rsi)
+; AVX-NEXT: retq
+ %rem = and i32 %position, 127
+ %ofs = zext nneg i32 %rem to i128
+ %bit = shl nuw i128 1, %ofs
+ %ldv = load <8 x i16>, ptr %word
+ %ld = bitcast <8 x i16> %ldv to i128
+ %test = and i128 %ld, %bit
+ %res = xor i128 %ld, %bit
+ store i128 %res, ptr %word
+ ret <8 x i16> %ldv
+}
+
+; Multiple loads in store chain
+define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
+; X86-LABEL: reset_multiload_i128:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: btrl %edx, %ebx
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: movl %ebx, (%ecx,%esi)
+; X86-NEXT: jae .LBB23_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB23_2:
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: reset_multiload_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %r9d
+; X64-NEXT: movl %r9d, %r8d
+; X64-NEXT: btrl %esi, %r8d
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: btl %esi, %r9d
+; X64-NEXT: jb .LBB23_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: .LBB23_2:
+; X64-NEXT: movl %r8d, (%rdi,%rcx)
+; X64-NEXT: retq
+ %rem = and i32 %position, 127
+ %ofs = zext nneg i32 %rem to i128
+ %bit = shl nuw i128 1, %ofs
+ %mask = xor i128 %bit, -1
+ %ld = load i128, ptr %word
+ %sel = load i32, ptr %p
+ %test = and i128 %ld, %bit
+ %res = and i128 %ld, %mask
+ %cmp = icmp eq i128 %test, 0
+ store i128 %res, ptr %word
+ %ret = select i1 %cmp, i32 %sel, i32 0
+ ret i32 %ret
+}
+
+; Multiple uses of the store chain AND stored value
+define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind {
+; X86-LABEL: chain_reset_i256:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $-2, %edi
+; X86-NEXT: roll %cl, %edi
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: andl $28, %ecx
+; X86-NEXT: andl %edi, (%esi,%ecx)
+; X86-NEXT: movl 8(%esi), %ebx
+; X86-NEXT: movl (%esi), %edi
+; X86-NEXT: movl 4(%esi), %ecx
+; X86-NEXT: movl 12(%esi), %ebp
+; X86-NEXT: orl 28(%esi), %ebp
+; X86-NEXT: orl 20(%esi), %ecx
+; X86-NEXT: orl %ebp, %ecx
+; X86-NEXT: orl 24(%esi), %ebx
+; X86-NEXT: movl 16(%esi), %ebp
+; X86-NEXT: orl %edi, %ebp
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: movl (%edx), %esi
+; X86-NEXT: movl %edi, (%edx)
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: orl %ecx, %ebp
+; X86-NEXT: jne .LBB24_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: .LBB24_2:
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: reset_eq_i512:
+; SSE-LABEL: chain_reset_i256:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $56, %rsp
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rdx
-; SSE-NEXT: movq (%rsp,%rdx), %r9
-; SSE-NEXT: movq 8(%rsp,%rdx), %r8
-; SSE-NEXT: movq %r8, %rsi
-; SSE-NEXT: shldq %cl, %r9, %rsi
-; SSE-NEXT: movq -8(%rsp,%rdx), %rax
-; SSE-NEXT: shldq %cl, %rax, %r9
-; SSE-NEXT: movq 16(%rsp,%rdx), %r14
-; SSE-NEXT: movq 24(%rsp,%rdx), %r10
-; SSE-NEXT: movq %r10, %rbx
-; SSE-NEXT: shldq %cl, %r14, %rbx
-; SSE-NEXT: shldq %cl, %r8, %r14
-; SSE-NEXT: movq 32(%rsp,%rdx), %r13
-; SSE-NEXT: movq 40(%rsp,%rdx), %r12
-; SSE-NEXT: shldq %cl, %r13, %r12
-; SSE-NEXT: shldq %cl, %r10, %r13
-; SSE-NEXT: movq -16(%rsp,%rdx), %rdx
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq %r12, %rbp
-; SSE-NEXT: movq %r9, %r15
-; SSE-NEXT: movq %rsi, %r11
-; SSE-NEXT: movq 16(%rdi), %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %r13
-; SSE-NEXT: andq %r8, %r9
-; SSE-NEXT: orq %r13, %r9
-; SSE-NEXT: movq 56(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %r12
-; SSE-NEXT: movq 24(%rdi), %r10
-; SSE-NEXT: andq %r10, %rsi
-; SSE-NEXT: orq %r12, %rsi
-; SSE-NEXT: movq %r14, %r13
-; SSE-NEXT: movq 32(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %r14
-; SSE-NEXT: movq %rdx, %r12
+; SSE-NEXT: # kill: def $ecx killed $ecx def $rcx
+; SSE-NEXT: movl $-2, %eax
+; SSE-NEXT: roll %cl, %eax
+; SSE-NEXT: shrl $3, %ecx
+; SSE-NEXT: andl $28, %ecx
+; SSE-NEXT: andl %eax, (%rdi,%rcx)
; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %rdx
-; SSE-NEXT: orq %r14, %rdx
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: movq %rbx, %r14
-; SSE-NEXT: movq 40(%rdi), %rcx
-; SSE-NEXT: andq %rcx, %rbx
-; SSE-NEXT: movq %rax, %r9
; SSE-NEXT: movq 8(%rdi), %r8
-; SSE-NEXT: andq %r8, %rax
-; SSE-NEXT: orq %rbx, %rax
-; SSE-NEXT: orq %rsi, %rax
-; SSE-NEXT: notq %r11
-; SSE-NEXT: andq %r10, %r11
-; SSE-NEXT: notq %r15
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: notq %r14
-; SSE-NEXT: andq %rcx, %r14
-; SSE-NEXT: notq %r13
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; SSE-NEXT: notq %rbp
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: notq %rcx
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; SSE-NEXT: notq %r9
-; SSE-NEXT: andq %r8, %r9
-; SSE-NEXT: notq %r12
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rcx, 48(%rdi)
-; SSE-NEXT: movq %rbp, 56(%rdi)
-; SSE-NEXT: movq %r13, 32(%rdi)
-; SSE-NEXT: movq %r14, 40(%rdi)
-; SSE-NEXT: movq %r15, 16(%rdi)
-; SSE-NEXT: movq %r11, 24(%rdi)
-; SSE-NEXT: movq %r12, (%rdi)
-; SSE-NEXT: movq %r9, 8(%rdi)
-; SSE-NEXT: sete %al
-; SSE-NEXT: addq $56, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: orq 24(%rdi), %r8
+; SSE-NEXT: movq 16(%rdi), %rdi
+; SSE-NEXT: orq %rcx, %rdi
+; SSE-NEXT: movl (%rsi), %eax
+; SSE-NEXT: movl %ecx, (%rsi)
+; SSE-NEXT: movl (%rdx), %ecx
+; SSE-NEXT: addl %ecx, %eax
+; SSE-NEXT: orq %r8, %rdi
+; SSE-NEXT: cmovnel %ecx, %eax
; SSE-NEXT: retq
;
-; AVX2-LABEL: reset_eq_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rdx
-; AVX2-NEXT: movq -48(%rsp,%rdx), %r8
-; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx
-; AVX2-NEXT: movq %rbx, %rax
-; AVX2-NEXT: shldq %cl, %r8, %rax
-; AVX2-NEXT: movq -16(%rsp,%rdx), %r10
-; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi
-; AVX2-NEXT: shldq %cl, %r10, %rsi
-; AVX2-NEXT: movq -32(%rsp,%rdx), %r11
-; AVX2-NEXT: movq -24(%rsp,%rdx), %r14
-; AVX2-NEXT: movq %r14, %r9
-; AVX2-NEXT: shldq %cl, %r11, %r9
-; AVX2-NEXT: movq -64(%rsp,%rdx), %r15
-; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx
-; AVX2-NEXT: shldq %cl, %rdx, %r8
-; AVX2-NEXT: shldq %cl, %r14, %r10
-; AVX2-NEXT: shldq %cl, %rbx, %r11
-; AVX2-NEXT: shldq %cl, %r15, %rdx
-; AVX2-NEXT: shlxq %rcx, %r15, %rcx
-; AVX2-NEXT: movq 24(%rdi), %rbx
-; AVX2-NEXT: movq 56(%rdi), %r14
-; AVX2-NEXT: movq 16(%rdi), %r15
-; AVX2-NEXT: movq 48(%rdi), %r13
-; AVX2-NEXT: movq 32(%rdi), %rbp
-; AVX2-NEXT: andnq %rbp, %r11, %r12
-; AVX2-NEXT: andq %r11, %rbp
-; AVX2-NEXT: andnq %r13, %r10, %r11
-; AVX2-NEXT: andq %r10, %r13
-; AVX2-NEXT: andnq %r15, %r8, %r10
-; AVX2-NEXT: andq %r8, %r15
-; AVX2-NEXT: movq 40(%rdi), %r8
-; AVX2-NEXT: orq %r13, %r15
-; AVX2-NEXT: andnq %r8, %r9, %r13
-; AVX2-NEXT: andq %r9, %r8
-; AVX2-NEXT: andnq %r14, %rsi, %r9
-; AVX2-NEXT: andq %rsi, %r14
-; AVX2-NEXT: andnq %rbx, %rax, %rsi
-; AVX2-NEXT: andq %rax, %rbx
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: orq %r14, %rbx
-; AVX2-NEXT: andnq %rax, %rcx, %r14
-; AVX2-NEXT: andq %rcx, %rax
-; AVX2-NEXT: orq %rbp, %rax
-; AVX2-NEXT: movq 8(%rdi), %rcx
-; AVX2-NEXT: orq %r15, %rax
-; AVX2-NEXT: andnq %rcx, %rdx, %r15
-; AVX2-NEXT: andq %rdx, %rcx
-; AVX2-NEXT: orq %r8, %rcx
-; AVX2-NEXT: orq %rbx, %rcx
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: movq %r11, 48(%rdi)
-; AVX2-NEXT: movq %r9, 56(%rdi)
-; AVX2-NEXT: movq %r12, 32(%rdi)
-; AVX2-NEXT: movq %r13, 40(%rdi)
-; AVX2-NEXT: movq %r10, 16(%rdi)
-; AVX2-NEXT: movq %rsi, 24(%rdi)
-; AVX2-NEXT: movq %r14, (%rdi)
-; AVX2-NEXT: movq %r15, 8(%rdi)
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: addq $8, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: reset_eq_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: pushq %rax
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq -48(%rsp,%rbx), %r8
-; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %rax
-; AVX512-NEXT: shldq %cl, %r8, %rax
-; AVX512-NEXT: movq -16(%rsp,%rbx), %r10
-; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi
-; AVX512-NEXT: shldq %cl, %r10, %rsi
-; AVX512-NEXT: movq -32(%rsp,%rbx), %r11
-; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
-; AVX512-NEXT: movq %r15, %r9
-; AVX512-NEXT: shldq %cl, %r11, %r9
-; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx
-; AVX512-NEXT: shldq %cl, %rdx, %r8
-; AVX512-NEXT: shldq %cl, %r15, %r10
-; AVX512-NEXT: shldq %cl, %r14, %r11
-; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
-; AVX512-NEXT: shldq %cl, %rbx, %rdx
-; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
-; AVX512-NEXT: movq 24(%rdi), %rbx
-; AVX512-NEXT: movq 56(%rdi), %r14
-; AVX512-NEXT: movq 16(%rdi), %r15
-; AVX512-NEXT: movq 48(%rdi), %r13
-; AVX512-NEXT: movq 32(%rdi), %rbp
-; AVX512-NEXT: andnq %rbp, %r11, %r12
-; AVX512-NEXT: andq %r11, %rbp
-; AVX512-NEXT: andnq %r13, %r10, %r11
-; AVX512-NEXT: andq %r10, %r13
-; AVX512-NEXT: andnq %r15, %r8, %r10
-; AVX512-NEXT: andq %r8, %r15
-; AVX512-NEXT: movq 40(%rdi), %r8
-; AVX512-NEXT: orq %r13, %r15
-; AVX512-NEXT: andnq %r8, %r9, %r13
-; AVX512-NEXT: andq %r9, %r8
-; AVX512-NEXT: andnq %r14, %rsi, %r9
-; AVX512-NEXT: andq %rsi, %r14
-; AVX512-NEXT: andnq %rbx, %rax, %rsi
-; AVX512-NEXT: andq %rax, %rbx
-; AVX512-NEXT: movq (%rdi), %rax
-; AVX512-NEXT: orq %r14, %rbx
-; AVX512-NEXT: andnq %rax, %rcx, %r14
-; AVX512-NEXT: andq %rcx, %rax
-; AVX512-NEXT: orq %rbp, %rax
-; AVX512-NEXT: movq 8(%rdi), %rcx
-; AVX512-NEXT: orq %r15, %rax
-; AVX512-NEXT: andnq %rcx, %rdx, %r15
-; AVX512-NEXT: andq %rdx, %rcx
-; AVX512-NEXT: orq %r8, %rcx
-; AVX512-NEXT: orq %rbx, %rcx
-; AVX512-NEXT: orq %rax, %rcx
-; AVX512-NEXT: movq %r11, 48(%rdi)
-; AVX512-NEXT: movq %r9, 56(%rdi)
-; AVX512-NEXT: movq %r12, 32(%rdi)
-; AVX512-NEXT: movq %r13, 40(%rdi)
-; AVX512-NEXT: movq %r10, 16(%rdi)
-; AVX512-NEXT: movq %rsi, 24(%rdi)
-; AVX512-NEXT: movq %r14, (%rdi)
-; AVX512-NEXT: movq %r15, 8(%rdi)
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: addq $8, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %rem = and i32 %position, 511
- %ofs = zext nneg i32 %rem to i512
- %bit = shl nuw i512 1, %ofs
- %mask = xor i512 %bit, -1
- %ld = load i512, ptr %word
- %test = and i512 %ld, %bit
- %res = and i512 %ld, %mask
- %cmp = icmp eq i512 %test, 0
- store i512 %res, ptr %word
- ret i1 %cmp
+; AVX-LABEL: chain_reset_i256:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX-NEXT: movl $-2, %eax
+; AVX-NEXT: roll %cl, %eax
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: andl $28, %ecx
+; AVX-NEXT: andl %eax, (%rdi,%rcx)
+; AVX-NEXT: vmovdqu (%rdi), %ymm0
+; AVX-NEXT: movl (%rdi), %ecx
+; AVX-NEXT: movl (%rsi), %eax
+; AVX-NEXT: movl %ecx, (%rsi)
+; AVX-NEXT: movl (%rdx), %ecx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vptest %ymm0, %ymm0
+; AVX-NEXT: cmovnel %ecx, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %rem = and i32 %position, 255
+ %ofs = zext nneg i32 %rem to i256
+ %bit = shl nuw i256 1, %ofs
+ %ld0 = load i256, ptr %p0
+ %msk = xor i256 %bit, -1
+ %res = and i256 %ld0, %msk
+ store i256 %res, ptr %p0
+ %cmp = icmp ne i256 %res, 0
+ %ld1 = load i32, ptr %p1
+ %trunc = trunc i256 %res to i32
+ store i32 %trunc, ptr %p1
+ %ld2 = load i32, ptr %p2
+ %add = add i32 %ld1, %ld2
+ %sel = select i1 %cmp, i32 %ld2, i32 %add
+ ret i32 %sel
}
-define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
-; X86-LABEL: set_ne_i512:
+; BTC/BT/BTS sequence on same i128
+define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
+; X86-LABEL: sequence_i128:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
@@ -2803,27 +1265,9 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $272, %esp # imm = 0x110
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: subl $144, %esp
+; X86-NEXT: movb 20(%ebp), %ch
+; X86-NEXT: movb 12(%ebp), %cl
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -2832,225 +1276,85 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %edx
+; X86-NEXT: movl 60(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: shll %cl, %edi
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edx), %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edx), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edx), %esi
+; X86-NEXT: movb %ch, %al
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 84(%esp,%eax), %edx
+; X86-NEXT: movl 88(%esp,%eax), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movzbl 20(%ebp), %ecx
+; X86-NEXT: shldl %cl, %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 52(%edx), %esi
+; X86-NEXT: movl 80(%esp,%eax), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 92(%esp,%eax), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl 8(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl 56(%edx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: movl 24(%edx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%eax), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 12(%eax), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl 60(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 28(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: negl %eax
-; X86-NEXT: movl 240(%esp,%eax), %esi
-; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl 32(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %eax
-; X86-NEXT: movl (%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 16(%esi), %eax
+; X86-NEXT: shldl %cl, %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: movl 48(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 36(%esi), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl %esi, %edi
-; X86-NEXT: movl 52(%eax), %eax
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shll %cl, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl 8(%eax), %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl 12(%eax), %esi
+; X86-NEXT: xorl (%eax), %edi
+; X86-NEXT: xorl 4(%eax), %ebx
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: andb $96, %al
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 96(%esp,%eax), %eax
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setae %al
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl %ebx, 60(%edx)
-; X86-NEXT: movl %edi, 56(%edx)
-; X86-NEXT: movl %ecx, 52(%edx)
-; X86-NEXT: movl %esi, 44(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 40(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 36(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 32(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 28(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 24(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 20(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 16(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 12(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 8(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 4(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, (%edx)
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 48(%edx)
-; X86-NEXT: setne %al
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl %edx, 8(%ecx)
+; X86-NEXT: movl %esi, 12(%ecx)
+; X86-NEXT: movl %edi, (%ecx)
+; X86-NEXT: movl %ebx, 4(%ecx)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -3058,324 +1362,135 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: set_ne_i512:
+; SSE-LABEL: sequence_i128:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $56, %rsp
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %ecx, %eax
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rbx
-; SSE-NEXT: movq (%rsp,%rbx), %rsi
-; SSE-NEXT: movq 8(%rsp,%rbx), %r14
-; SSE-NEXT: movq %r14, %rax
-; SSE-NEXT: shldq %cl, %rsi, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 32(%rsp,%rbx), %r8
-; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
-; SSE-NEXT: shldq %cl, %r8, %rbp
-; SSE-NEXT: movq 16(%rsp,%rbx), %r9
-; SSE-NEXT: movq 24(%rsp,%rbx), %r15
-; SSE-NEXT: movq %r15, %r10
-; SSE-NEXT: shldq %cl, %r9, %r10
-; SSE-NEXT: movq -8(%rsp,%rbx), %r11
-; SSE-NEXT: shldq %cl, %r11, %rsi
-; SSE-NEXT: shldq %cl, %r15, %r8
-; SSE-NEXT: shldq %cl, %r14, %r9
-; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
-; SSE-NEXT: shldq %cl, %rbx, %r11
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rbx
-; SSE-NEXT: movq 24(%rdi), %r15
-; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 56(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 16(%rdi), %r12
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %r8, %r13
-; SSE-NEXT: andq %rsi, %r12
-; SSE-NEXT: orq %r13, %r12
-; SSE-NEXT: movq %rcx, %r13
-; SSE-NEXT: andq %rbp, %r13
-; SSE-NEXT: andq %rax, %r15
-; SSE-NEXT: orq %r13, %r15
-; SSE-NEXT: movq 32(%rdi), %r14
-; SSE-NEXT: movq %r14, %rcx
-; SSE-NEXT: andq %r9, %rcx
-; SSE-NEXT: movq (%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rbx, %r13
-; SSE-NEXT: orq %rcx, %r13
-; SSE-NEXT: orq %r12, %r13
-; SSE-NEXT: movq 40(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r12
-; SSE-NEXT: andq %r10, %r12
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movq %rdx, %rax
-; SSE-NEXT: andq %r11, %rax
-; SSE-NEXT: orq %r12, %rax
-; SSE-NEXT: orq %r15, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; SSE-NEXT: orq %rcx, %r10
-; SSE-NEXT: orq %r14, %r9
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: movq %r8, 48(%rdi)
-; SSE-NEXT: movq %rbp, 56(%rdi)
-; SSE-NEXT: movq %r9, 32(%rdi)
-; SSE-NEXT: movq %r10, 40(%rdi)
-; SSE-NEXT: movq %rsi, 16(%rdi)
-; SSE-NEXT: movq %r15, 24(%rdi)
-; SSE-NEXT: movq %rbx, (%rdi)
-; SSE-NEXT: movq %r11, 8(%rdi)
-; SSE-NEXT: setne %al
-; SSE-NEXT: addq $56, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: movl $1, %r8d
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %r8, %rsi
+; SSE-NEXT: movl $1, %r9d
+; SSE-NEXT: shlq %cl, %r9
+; SSE-NEXT: xorl %r11d, %r11d
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %r9, %rsi
+; SSE-NEXT: cmovneq %r11, %r9
+; SSE-NEXT: xorl %r10d, %r10d
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: shldq %cl, %r8, %r10
+; SSE-NEXT: shlq %cl, %r8
+; SSE-NEXT: testb $64, %al
+; SSE-NEXT: cmovneq %r8, %r10
+; SSE-NEXT: cmovneq %r11, %r8
+; SSE-NEXT: xorq 8(%rdi), %rsi
+; SSE-NEXT: xorq (%rdi), %r9
+; SSE-NEXT: movl %edx, %ecx
+; SSE-NEXT: andb $32, %cl
+; SSE-NEXT: movq %r9, %rax
+; SSE-NEXT: shrdq %cl, %rsi, %rax
+; SSE-NEXT: movq %rsi, %r11
+; SSE-NEXT: shrq %cl, %r11
+; SSE-NEXT: testb $64, %dl
+; SSE-NEXT: cmoveq %rax, %r11
+; SSE-NEXT: btl %edx, %r11d
+; SSE-NEXT: setae %al
+; SSE-NEXT: orq %r10, %rsi
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: movq %r9, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
; SSE-NEXT: retq
;
-; AVX2-LABEL: set_ne_i512:
+; AVX2-LABEL: sequence_i128:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $72, %rsp
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, (%rsp)
+; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rbx
-; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX2-NEXT: movq %rbp, %rax
-; AVX2-NEXT: shldq %cl, %rsi, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX2-NEXT: shldq %cl, %r8, %r13
-; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX2-NEXT: movq %r14, %r10
-; AVX2-NEXT: shldq %cl, %r9, %r10
-; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX2-NEXT: shldq %cl, %r11, %rsi
-; AVX2-NEXT: shldq %cl, %r14, %r8
-; AVX2-NEXT: movq 16(%rdi), %r12
-; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r8, %r14
-; AVX2-NEXT: andq %rsi, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq 56(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r13, %r15
-; AVX2-NEXT: movq 24(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %rax, %r14
-; AVX2-NEXT: orq %r15, %r14
-; AVX2-NEXT: shldq %cl, %rbp, %r9
-; AVX2-NEXT: movq (%rsp,%rbx), %rdx
-; AVX2-NEXT: movq 32(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r9, %r15
-; AVX2-NEXT: shlxq %rcx, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq (%rdi), %rbx
-; AVX2-NEXT: movq %rbx, %rbp
-; AVX2-NEXT: andq %rax, %rbp
-; AVX2-NEXT: orq %r15, %rbp
-; AVX2-NEXT: orq %r12, %rbp
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %rdx, %r11
-; AVX2-NEXT: movq 40(%rdi), %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: andq %r10, %rcx
-; AVX2-NEXT: movq 8(%rdi), %r15
-; AVX2-NEXT: movq %r15, %r12
-; AVX2-NEXT: andq %r11, %r12
-; AVX2-NEXT: orq %rcx, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX2-NEXT: orq %rax, %r10
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT: orq %r15, %r11
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX2-NEXT: orq %rbp, %r12
-; AVX2-NEXT: movq %r8, 48(%rdi)
-; AVX2-NEXT: movq %r13, 56(%rdi)
-; AVX2-NEXT: movq %r9, 32(%rdi)
-; AVX2-NEXT: movq %r10, 40(%rdi)
-; AVX2-NEXT: movq %rsi, 16(%rdi)
-; AVX2-NEXT: movq %rcx, 24(%rdi)
-; AVX2-NEXT: movq %rbx, (%rdi)
-; AVX2-NEXT: movq %r11, 8(%rdi)
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: addq $72, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: movl $1, %r10d
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: shldq %cl, %r10, %rsi
+; AVX2-NEXT: shlxq %rcx, %r10, %r8
+; AVX2-NEXT: testb $64, %cl
+; AVX2-NEXT: cmovneq %r8, %rsi
+; AVX2-NEXT: cmovneq %r9, %r8
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shldq %cl, %r10, %r11
+; AVX2-NEXT: shlxq %rax, %r10, %r10
+; AVX2-NEXT: testb $64, %al
+; AVX2-NEXT: cmovneq %r10, %r11
+; AVX2-NEXT: cmovneq %r9, %r10
+; AVX2-NEXT: xorq 8(%rdi), %rsi
+; AVX2-NEXT: xorq (%rdi), %r8
+; AVX2-NEXT: movl %edx, %ecx
+; AVX2-NEXT: andb $32, %cl
+; AVX2-NEXT: movq %r8, %rax
+; AVX2-NEXT: shrdq %cl, %rsi, %rax
+; AVX2-NEXT: shrxq %rcx, %rsi, %rcx
+; AVX2-NEXT: testb $64, %dl
+; AVX2-NEXT: cmoveq %rax, %rcx
+; AVX2-NEXT: btl %edx, %ecx
+; AVX2-NEXT: setae %al
+; AVX2-NEXT: orq %r11, %rsi
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: movq %r8, (%rdi)
+; AVX2-NEXT: movq %rsi, 8(%rdi)
; AVX2-NEXT: retq
;
-; AVX512-LABEL: set_ne_i512:
+; AVX512-LABEL: sequence_i128:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $72, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, (%rsp)
+; AVX512-NEXT: movl %ecx, %eax
; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX512-NEXT: movq %rbp, %rax
-; AVX512-NEXT: shldq %cl, %rsi, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX512-NEXT: shldq %cl, %r8, %r13
-; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %r10
-; AVX512-NEXT: shldq %cl, %r9, %r10
-; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX512-NEXT: shldq %cl, %r11, %rsi
-; AVX512-NEXT: shldq %cl, %r14, %r8
-; AVX512-NEXT: movq 16(%rdi), %r12
-; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r8, %r14
-; AVX512-NEXT: andq %rsi, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq 56(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r13, %r15
-; AVX512-NEXT: movq 24(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %rax, %r14
-; AVX512-NEXT: orq %r15, %r14
-; AVX512-NEXT: shldq %cl, %rbp, %r9
-; AVX512-NEXT: movq (%rsp,%rbx), %rdx
-; AVX512-NEXT: movq 32(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r9, %r15
-; AVX512-NEXT: shlxq %rcx, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq (%rdi), %rbx
-; AVX512-NEXT: movq %rbx, %rbp
-; AVX512-NEXT: andq %rax, %rbp
-; AVX512-NEXT: orq %r15, %rbp
-; AVX512-NEXT: orq %r12, %rbp
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rdx, %r11
-; AVX512-NEXT: movq 40(%rdi), %rax
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andq %r10, %rcx
-; AVX512-NEXT: movq 8(%rdi), %r15
-; AVX512-NEXT: movq %r15, %r12
-; AVX512-NEXT: andq %r11, %r12
-; AVX512-NEXT: orq %rcx, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT: orq %rax, %r10
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX512-NEXT: orq %r15, %r11
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX512-NEXT: orq %rbp, %r12
-; AVX512-NEXT: movq %r8, 48(%rdi)
-; AVX512-NEXT: movq %r13, 56(%rdi)
-; AVX512-NEXT: movq %r9, 32(%rdi)
-; AVX512-NEXT: movq %r10, 40(%rdi)
-; AVX512-NEXT: movq %rsi, 16(%rdi)
-; AVX512-NEXT: movq %rcx, 24(%rdi)
-; AVX512-NEXT: movq %rbx, (%rdi)
-; AVX512-NEXT: movq %r11, 8(%rdi)
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: addq $72, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: movl $1, %r9d
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: shldq %cl, %r9, %rsi
+; AVX512-NEXT: xorl %r10d, %r10d
+; AVX512-NEXT: shlxq %rcx, %r9, %r8
+; AVX512-NEXT: testb $64, %cl
+; AVX512-NEXT: cmovneq %r8, %rsi
+; AVX512-NEXT: cmovneq %r10, %r8
+; AVX512-NEXT: xorl %r11d, %r11d
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shldq %cl, %r9, %r11
+; AVX512-NEXT: shlxq %rax, %r9, %r9
+; AVX512-NEXT: testb $64, %al
+; AVX512-NEXT: cmovneq %r9, %r11
+; AVX512-NEXT: cmovneq %r10, %r9
+; AVX512-NEXT: xorq 8(%rdi), %rsi
+; AVX512-NEXT: xorq (%rdi), %r8
+; AVX512-NEXT: movl %edx, %ecx
+; AVX512-NEXT: andb $32, %cl
+; AVX512-NEXT: movq %r8, %rax
+; AVX512-NEXT: shrdq %cl, %rsi, %rax
+; AVX512-NEXT: shrxq %rcx, %rsi, %rcx
+; AVX512-NEXT: testb $64, %dl
+; AVX512-NEXT: cmoveq %rax, %rcx
+; AVX512-NEXT: btl %edx, %ecx
+; AVX512-NEXT: setae %al
+; AVX512-NEXT: orq %r11, %rsi
+; AVX512-NEXT: orq %r9, %r8
+; AVX512-NEXT: movq %r8, (%rdi)
+; AVX512-NEXT: movq %rsi, 8(%rdi)
; AVX512-NEXT: retq
- %rem = and i32 %position, 511
- %ofs = zext nneg i32 %rem to i512
- %bit = shl nuw i512 1, %ofs
- %ld = load i512, ptr %word
- %test = and i512 %ld, %bit
- %res = or i512 %ld, %bit
- %cmp = icmp ne i512 %test, 0
- store i512 %res, ptr %word
- ret i1 %cmp
+ %rem0 = and i32 %pos0, 127
+ %rem1 = and i32 %pos1, 127
+ %rem2 = and i32 %pos2, 127
+ %ofs0 = zext nneg i32 %rem0 to i128
+ %ofs1 = zext nneg i32 %rem1 to i128
+ %ofs2 = zext nneg i32 %rem2 to i128
+ %bit0 = shl nuw i128 1, %ofs0
+ %bit1 = shl nuw i128 1, %ofs1
+ %bit2 = shl nuw i128 1, %ofs2
+ %ld = load i128, ptr %word
+ %res0 = xor i128 %ld, %bit0
+ %test1 = and i128 %res0, %bit1
+ %cmp1 = icmp eq i128 %test1, 0
+ %res2 = or i128 %res0, %bit2
+ store i128 %res2, ptr %word
+ ret i1 %cmp1
}
-define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
-; X86-LABEL: init_eq_i512:
+define i32 @blsr_u512(ptr %word) nounwind {
+; X86-LABEL: blsr_u512:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
@@ -3383,126 +1498,215 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $432, %esp # imm = 0x1B0
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: andl $60, %edx
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl %edx, %esi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 56(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%esi), %eax
+; X86-NEXT: subl $240, %esp
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: movl 12(%ebx), %esi
+; X86-NEXT: movl 28(%ebx), %eax
+; X86-NEXT: movl 60(%ebx), %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%esi), %eax
-; X86-NEXT: movl 48(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%esi), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl 16(%ebp), %ebx
-; X86-NEXT: movzbl %bl, %esi
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 44(%ebx), %edx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %esi, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 20(%ebx), %edx
+; X86-NEXT: movl 52(%ebx), %eax
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl 4(%ebx), %edi
+; X86-NEXT: movl 36(%ebx), %esi
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 24(%ebx), %edx
+; X86-NEXT: movl 56(%ebx), %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl 8(%ebx), %ecx
+; X86-NEXT: movl 40(%ebx), %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl 16(%ebx), %edx
+; X86-NEXT: movl 48(%ebx), %esi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl (%ebx), %esi
+; X86-NEXT: movl 32(%ebx), %ebx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: je .LBB26_1
+; X86-NEXT: # %bb.2: # %cond.false
+; X86-NEXT: testl %ebx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: jne .LBB26_3
+; X86-NEXT: # %bb.4: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: addl $32, %eax
+; X86-NEXT: jmp .LBB26_5
+; X86-NEXT: .LBB26_1:
+; X86-NEXT: movl $512, %ecx # imm = 0x200
+; X86-NEXT: jmp .LBB26_41
+; X86-NEXT: .LBB26_3:
+; X86-NEXT: rep bsfl %ebx, %eax
+; X86-NEXT: .LBB26_5: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB26_6
+; X86-NEXT: # %bb.7: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: jmp .LBB26_8
+; X86-NEXT: .LBB26_6:
+; X86-NEXT: rep bsfl %ecx, %ecx
+; X86-NEXT: .LBB26_8: # %cond.false
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: jne .LBB26_10
+; X86-NEXT: # %bb.9: # %cond.false
+; X86-NEXT: addl $64, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: .LBB26_10: # %cond.false
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jne .LBB26_11
+; X86-NEXT: # %bb.12: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: je .LBB26_15
+; X86-NEXT: .LBB26_14:
+; X86-NEXT: rep bsfl %edx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: je .LBB26_17
+; X86-NEXT: jmp .LBB26_18
+; X86-NEXT: .LBB26_11:
+; X86-NEXT: rep bsfl %esi, %ecx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB26_14
+; X86-NEXT: .LBB26_15: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: jne .LBB26_18
+; X86-NEXT: .LBB26_17: # %cond.false
+; X86-NEXT: addl $64, %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: .LBB26_18: # %cond.false
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: jne .LBB26_20
+; X86-NEXT: # %bb.19: # %cond.false
+; X86-NEXT: subl $-128, %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: .LBB26_20: # %cond.false
+; X86-NEXT: addl $256, %eax # imm = 0x100
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: jne .LBB26_21
+; X86-NEXT: # %bb.22: # %cond.false
+; X86-NEXT: rep bsfl %edi, %ebx
+; X86-NEXT: addl $32, %ebx
+; X86-NEXT: jmp .LBB26_23
+; X86-NEXT: .LBB26_21:
+; X86-NEXT: rep bsfl %edx, %ebx
+; X86-NEXT: .LBB26_23: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: testl %ecx, %ecx
+; X86-NEXT: jne .LBB26_24
+; X86-NEXT: # %bb.25: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: je .LBB26_27
+; X86-NEXT: jmp .LBB26_28
+; X86-NEXT: .LBB26_24:
+; X86-NEXT: rep bsfl %ecx, %ecx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: jne .LBB26_28
+; X86-NEXT: .LBB26_27: # %cond.false
+; X86-NEXT: addl $64, %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: .LBB26_28: # %cond.false
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: jne .LBB26_29
+; X86-NEXT: # %bb.30: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %ecx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: je .LBB26_33
+; X86-NEXT: .LBB26_32:
+; X86-NEXT: rep bsfl %edx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: je .LBB26_35
+; X86-NEXT: jmp .LBB26_36
+; X86-NEXT: .LBB26_29:
+; X86-NEXT: rep bsfl %esi, %ecx
+; X86-NEXT: testl %edx, %edx
+; X86-NEXT: jne .LBB26_32
+; X86-NEXT: .LBB26_33: # %cond.false
+; X86-NEXT: rep bsfl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: addl $32, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: jne .LBB26_36
+; X86-NEXT: .LBB26_35: # %cond.false
+; X86-NEXT: addl $64, %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: .LBB26_36: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: jne .LBB26_38
+; X86-NEXT: # %bb.37: # %cond.false
+; X86-NEXT: subl $-128, %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: .LBB26_38: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: jne .LBB26_40
+; X86-NEXT: # %bb.39: # %cond.false
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: .LBB26_40: # %cond.false
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: .LBB26_41: # %cond.end
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %esi, %edx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -3518,6 +1722,7 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -3534,1948 +1739,133 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%ebx), %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: movl 56(%edx), %edi
+; X86-NEXT: movl 60(%edx), %esi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%ebx), %esi
+; X86-NEXT: movl 52(%edx), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: notl %edi
+; X86-NEXT: andl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebx), %esi
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl 44(%edx), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%ebx), %eax
+; X86-NEXT: movl 36(%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%ebx), %esi
+; X86-NEXT: movl 32(%edx), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%ebx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl 28(%edx), %esi
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: andl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 24(%edx), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl 56(%edi), %ebx
-; X86-NEXT: movl 60(%edi), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 52(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 48(%edi), %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: notl %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 4(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edx), %eax
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl 40(%edi), %ebx
-; X86-NEXT: movl 44(%edi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 36(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 32(%edi), %ebx
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 28(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 24(%edi), %ebx
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 20(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 16(%edi), %ebx
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edx
; X86-NEXT: notl %edx
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 12(%edi), %eax
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl 12(%ebx), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: notl %esi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 8(%edi), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: orl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%ebx), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebx), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
; X86-NEXT: notl %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 4(%edi), %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: orl %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl (%edi), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 208(%esp,%eax), %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 60(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 56(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 52(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 44(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 40(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 36(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 32(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 28(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 24(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 20(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 16(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %esi, 48(%eax)
-; X86-NEXT: sete %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
-;
-; SSE-LABEL: init_eq_i512:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $216, %rsp
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %r10
-; SSE-NEXT: movq 184(%rsp,%r10), %r11
-; SSE-NEXT: movq 192(%rsp,%r10), %rsi
-; SSE-NEXT: movq %rsi, %r13
-; SSE-NEXT: shldq %cl, %r11, %r13
-; SSE-NEXT: movq 200(%rsp,%r10), %r15
-; SSE-NEXT: shldq %cl, %rsi, %r15
-; SSE-NEXT: movq 168(%rsp,%r10), %rbx
-; SSE-NEXT: movq 176(%rsp,%r10), %rsi
-; SSE-NEXT: movq %rsi, %r14
-; SSE-NEXT: shldq %cl, %rbx, %r14
-; SSE-NEXT: shldq %cl, %rsi, %r11
-; SSE-NEXT: movq 152(%rsp,%r10), %rax
-; SSE-NEXT: movq 160(%rsp,%r10), %r8
-; SSE-NEXT: movq %r8, %r12
-; SSE-NEXT: shldq %cl, %rax, %r12
-; SSE-NEXT: shldq %cl, %r8, %rbx
-; SSE-NEXT: movq 144(%rsp,%r10), %r9
-; SSE-NEXT: movq %r9, %r8
-; SSE-NEXT: shlq %cl, %r8
-; SSE-NEXT: shldq %cl, %r9, %rax
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movl %edx, %edx
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, (%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq 16(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %rsi
-; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rsi, %r13
-; SSE-NEXT: andq %rdx, %r12
-; SSE-NEXT: orq %r13, %r12
-; SSE-NEXT: movq %r15, %rsi
-; SSE-NEXT: movq 56(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %r15
-; SSE-NEXT: movq %rbx, %r13
-; SSE-NEXT: movq 24(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %rbx
-; SSE-NEXT: orq %r15, %rbx
-; SSE-NEXT: movq %r14, %rbp
-; SSE-NEXT: movq 32(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %r14
-; SSE-NEXT: movq %r8, %r15
-; SSE-NEXT: movq (%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %r8
-; SSE-NEXT: orq %r14, %r8
-; SSE-NEXT: orq %r12, %r8
-; SSE-NEXT: movq %r11, %r12
-; SSE-NEXT: movq 40(%rdi), %r9
-; SSE-NEXT: andq %r9, %r11
-; SSE-NEXT: movq %rax, %r14
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %rax
-; SSE-NEXT: orq %r11, %rax
-; SSE-NEXT: orq %rbx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: notq %rax
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq 56(%rsp,%r10), %r11
-; SSE-NEXT: movq 64(%rsp,%r10), %rax
-; SSE-NEXT: movq %rax, %rbx
-; SSE-NEXT: shldq %cl, %r11, %rbx
-; SSE-NEXT: orq %rbx, %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: movq 72(%rsp,%r10), %rbx
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; SSE-NEXT: orq %rbx, %rsi
-; SSE-NEXT: notq %rbp
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: movq 40(%rsp,%r10), %rax
-; SSE-NEXT: movq 48(%rsp,%r10), %rdx
-; SSE-NEXT: movq %rdx, %rbx
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: orq %rbx, %rbp
-; SSE-NEXT: notq %r12
-; SSE-NEXT: andq %r9, %r12
-; SSE-NEXT: shldq %cl, %rdx, %r11
-; SSE-NEXT: movq 24(%rsp,%r10), %r9
-; SSE-NEXT: movq 32(%rsp,%r10), %rdx
-; SSE-NEXT: movq %rdx, %rbx
-; SSE-NEXT: shldq %cl, %r9, %rbx
-; SSE-NEXT: orq %r11, %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: notq %r11
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: orq %rbx, %r11
-; SSE-NEXT: notq %r13
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; SSE-NEXT: orq %rax, %r13
-; SSE-NEXT: notq %r15
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: movq 16(%rsp,%r10), %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: orq %rdx, %r15
-; SSE-NEXT: notq %r14
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shldq %cl, %rax, %r9
-; SSE-NEXT: orq %r9, %r14
-; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: movq %rax, 48(%rdi)
-; SSE-NEXT: movq %rsi, 56(%rdi)
-; SSE-NEXT: movq %rbp, 32(%rdi)
-; SSE-NEXT: movq %r12, 40(%rdi)
-; SSE-NEXT: movq %r11, 16(%rdi)
-; SSE-NEXT: movq %r13, 24(%rdi)
-; SSE-NEXT: movq %r15, (%rdi)
-; SSE-NEXT: movq %r14, 8(%rdi)
-; SSE-NEXT: sete %al
-; SSE-NEXT: addq $216, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: init_eq_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $200, %rsp
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %r8d
-; AVX2-NEXT: andl $63, %r8d
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rsi
-; AVX2-NEXT: movq 144(%rsp,%rsi), %r11
-; AVX2-NEXT: movq 152(%rsp,%rsi), %r12
-; AVX2-NEXT: movq %r12, %r10
-; AVX2-NEXT: movl %r8d, %ecx
-; AVX2-NEXT: shldq %cl, %r11, %r10
-; AVX2-NEXT: movq 176(%rsp,%rsi), %r14
-; AVX2-NEXT: movq 184(%rsp,%rsi), %r9
-; AVX2-NEXT: shldq %cl, %r14, %r9
-; AVX2-NEXT: movq 160(%rsp,%rsi), %r15
-; AVX2-NEXT: movq 168(%rsp,%rsi), %r13
-; AVX2-NEXT: movq %r13, %rbx
-; AVX2-NEXT: shldq %cl, %r15, %rbx
-; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp
-; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 136(%rsp,%rsi), %rax
-; AVX2-NEXT: shldq %cl, %rax, %r11
-; AVX2-NEXT: shldq %cl, %r13, %r14
-; AVX2-NEXT: shldq %cl, %r12, %r15
-; AVX2-NEXT: shldq %cl, %rbp, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl %edx, %edx
-; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, (%rsp)
-; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq 16(%rdi), %r12
-; AVX2-NEXT: movq 48(%rdi), %rbp
-; AVX2-NEXT: movq 32(%rdi), %r13
-; AVX2-NEXT: andnq %r13, %r15, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r15, %r13
-; AVX2-NEXT: andnq %rbp, %r14, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r14, %rbp
-; AVX2-NEXT: andnq %r12, %r11, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r11, %r12
-; AVX2-NEXT: movq 40(%rdi), %rax
-; AVX2-NEXT: orq %rbp, %r12
-; AVX2-NEXT: andnq %rax, %rbx, %rcx
-; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq %rax, %rbp
-; AVX2-NEXT: andq %rbx, %rbp
-; AVX2-NEXT: movq 56(%rdi), %rcx
-; AVX2-NEXT: andnq %rcx, %r9, %rbx
-; AVX2-NEXT: andq %r9, %rcx
-; AVX2-NEXT: movq 24(%rdi), %rax
-; AVX2-NEXT: andnq %rax, %r10, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r10, %rax
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: movq (%rdi), %r10
-; AVX2-NEXT: andnq %r10, %rcx, %r15
-; AVX2-NEXT: andq %rcx, %r10
-; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq 48(%rsp,%rsi), %r11
-; AVX2-NEXT: movq %r11, %r9
-; AVX2-NEXT: movl %r8d, %ecx
-; AVX2-NEXT: shldq %cl, %rdx, %r9
-; AVX2-NEXT: orq %r13, %r10
-; AVX2-NEXT: orq %r12, %r10
-; AVX2-NEXT: movq 8(%rdi), %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: andnq %r13, %rcx, %r12
-; AVX2-NEXT: andq %rcx, %r13
-; AVX2-NEXT: orq %rbp, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq 56(%rsp,%rsi), %rax
-; AVX2-NEXT: movl %r8d, %ecx
-; AVX2-NEXT: shldq %cl, %r11, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: orq %r9, %r14
-; AVX2-NEXT: orq %rax, %rbx
-; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 24(%rsp,%rsi), %rax
-; AVX2-NEXT: movq 32(%rsp,%rsi), %r9
-; AVX2-NEXT: movq %r9, %r11
-; AVX2-NEXT: shldq %cl, %rax, %r11
-; AVX2-NEXT: shldq %cl, %r9, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX2-NEXT: orq %r11, %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: orq %rdx, %rbx
-; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq 16(%rsp,%rsi), %r9
-; AVX2-NEXT: movq %r9, %r11
-; AVX2-NEXT: shldq %cl, %rdx, %r11
-; AVX2-NEXT: shldq %cl, %r9, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: orq %r11, %r9
-; AVX2-NEXT: movq (%rsp,%rsi), %rsi
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: shlxq %r8, %rsi, %rax
-; AVX2-NEXT: shldq %cl, %rsi, %rdx
-; AVX2-NEXT: orq %rax, %r15
-; AVX2-NEXT: orq %rdx, %r12
-; AVX2-NEXT: orq %r10, %r13
-; AVX2-NEXT: movq %r14, 48(%rdi)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: movq %rax, 56(%rdi)
-; AVX2-NEXT: movq %rbp, 32(%rdi)
-; AVX2-NEXT: movq %rbx, 40(%rdi)
-; AVX2-NEXT: movq %r9, 16(%rdi)
-; AVX2-NEXT: movq %r11, 24(%rdi)
-; AVX2-NEXT: movq %r15, (%rdi)
-; AVX2-NEXT: movq %r12, 8(%rdi)
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: addq $200, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: init_eq_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $184, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rsi
-; AVX512-NEXT: movq 128(%rsp,%rsi), %r10
-; AVX512-NEXT: movq 136(%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rax
-; AVX512-NEXT: shldq %cl, %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 160(%rsp,%rsi), %r14
-; AVX512-NEXT: movq 168(%rsp,%rsi), %rax
-; AVX512-NEXT: shldq %cl, %r14, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 144(%rsp,%rsi), %r15
-; AVX512-NEXT: movq 152(%rsp,%rsi), %r11
-; AVX512-NEXT: movq %r11, %rbx
-; AVX512-NEXT: shldq %cl, %r15, %rbx
-; AVX512-NEXT: movq 120(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rax, %r10
-; AVX512-NEXT: shldq %cl, %r11, %r14
-; AVX512-NEXT: movq %rdi, %r9
-; AVX512-NEXT: movq 112(%rsp,%rsi), %r11
-; AVX512-NEXT: shldq %cl, %r12, %r15
-; AVX512-NEXT: movl %edx, %edx
-; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq 16(%rdi), %r12
-; AVX512-NEXT: movq 48(%rdi), %r13
-; AVX512-NEXT: movq 32(%rdi), %rbp
-; AVX512-NEXT: andnq %rbp, %r15, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r15, %rbp
-; AVX512-NEXT: andnq %r13, %r14, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r14, %r13
-; AVX512-NEXT: andnq %r12, %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r10, %r12
-; AVX512-NEXT: movq 40(%rdi), %r8
-; AVX512-NEXT: orq %r13, %r12
-; AVX512-NEXT: andnq %r8, %rbx, %rdi
-; AVX512-NEXT: andq %rbx, %r8
-; AVX512-NEXT: movq 56(%r9), %r13
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT: andnq %r13, %rdx, %r10
-; AVX512-NEXT: andq %rdx, %r13
-; AVX512-NEXT: movq 24(%r9), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT: andnq %rax, %rdx, %r15
-; AVX512-NEXT: andq %rdx, %rax
-; AVX512-NEXT: orq %r13, %rax
-; AVX512-NEXT: shlxq %rcx, %r11, %r13
-; AVX512-NEXT: movq (%r9), %rdx
-; AVX512-NEXT: andnq %rdx, %r13, %r14
-; AVX512-NEXT: andq %r13, %rdx
-; AVX512-NEXT: orq %rbp, %rdx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r11, %rbp
-; AVX512-NEXT: orq %r12, %rdx
-; AVX512-NEXT: movq 8(%r9), %r13
-; AVX512-NEXT: andnq %r13, %rbp, %rbx
-; AVX512-NEXT: andq %rbp, %r13
-; AVX512-NEXT: orq %r8, %r13
-; AVX512-NEXT: movq 24(%rsp,%rsi), %r8
-; AVX512-NEXT: orq %rax, %r13
-; AVX512-NEXT: movq 32(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, %r12
-; AVX512-NEXT: shldq %cl, %r8, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: orq %r12, %r11
-; AVX512-NEXT: movq 40(%rsp,%rsi), %r12
-; AVX512-NEXT: shldq %cl, %rax, %r12
-; AVX512-NEXT: orq %r12, %r10
-; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 8(%rsp,%rsi), %rax
-; AVX512-NEXT: movq 16(%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rbp
-; AVX512-NEXT: shldq %cl, %rax, %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: orq %rbp, %r10
-; AVX512-NEXT: shldq %cl, %r12, %r8
-; AVX512-NEXT: orq %r8, %rdi
-; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq -8(%rsp,%rsi), %r8
-; AVX512-NEXT: movq (%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rbp
-; AVX512-NEXT: shldq %cl, %r8, %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT: orq %rbp, %rdi
-; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi
-; AVX512-NEXT: shldq %cl, %r12, %rax
-; AVX512-NEXT: orq %rax, %r15
-; AVX512-NEXT: shlxq %rcx, %rsi, %rax
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rsi, %r8
-; AVX512-NEXT: orq %rax, %r14
-; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: orq %rdx, %r13
-; AVX512-NEXT: movq %r11, 48(%r9)
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, 56(%r9)
-; AVX512-NEXT: movq %r10, 32(%r9)
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, 40(%r9)
-; AVX512-NEXT: movq %rdi, 16(%r9)
-; AVX512-NEXT: movq %r15, 24(%r9)
-; AVX512-NEXT: movq %r14, (%r9)
-; AVX512-NEXT: movq %rbx, 8(%r9)
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: addq $184, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %rem = and i32 %position, 511
- %ofs = zext nneg i32 %rem to i512
- %bit = shl nuw i512 1, %ofs
- %mask = xor i512 %bit, -1
- %val0 = zext i1 %value to i512
- %val = shl nuw i512 %val0, %ofs
- %ld = load i512, ptr %word
- %test = and i512 %ld, %bit
- %res0 = and i512 %ld, %mask
- %res = or i512 %res0, %val
- %cmp = icmp eq i512 %test, 0
- store i512 %res, ptr %word
- ret i1 %cmp
-}
-
-; i4096
-
-define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
-; X86-LABEL: test_ne_i4096:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $1792, %esp # imm = 0x700
-; X86-NEXT: movl 12(%ebp), %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: andl $508, %ecx # imm = 0x1FC
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 248(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 252(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 504(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 508(%esi), %edx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 120(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 124(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 376(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 380(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 184(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 188(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 440(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 444(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 312(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 316(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 216(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 220(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 472(%esi), %edi
-; X86-NEXT: movl 476(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 88(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 92(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 344(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 348(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 152(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 156(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 408(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 412(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 280(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 284(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 232(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 236(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 488(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 492(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 104(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 108(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 360(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 364(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 168(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 172(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 424(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 428(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 296(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 300(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 200(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 204(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 456(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 460(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 76(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 328(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 332(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 136(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 140(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 392(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 396(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 264(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 268(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 240(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 244(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 496(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 500(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 112(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 116(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 368(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 372(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 176(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 180(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 432(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 436(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 304(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 308(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 208(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 212(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 464(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 468(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 80(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 84(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 336(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 340(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 144(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 148(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 400(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 404(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 272(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 276(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 224(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 228(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 480(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 484(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 100(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 352(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 356(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 160(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 164(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 416(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 420(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 288(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 292(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 192(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 196(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 448(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 452(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 64(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 68(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 320(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 324(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 128(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 132(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: movl 256(%esi), %edi
-; X86-NEXT: movl 260(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 388(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 4(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shrdl $1, %eax, %edi
-; X86-NEXT: shrl %eax
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: notb %cl
-; X86-NEXT: shrdl %cl, %eax, %edi
-; X86-NEXT: shrl %cl, %ebx
-; X86-NEXT: movb $32, %cl
-; X86-NEXT: testb %cl, %cl
-; X86-NEXT: movl (%esi), %eax
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl (%ebx), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: jne .LBB20_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: .LBB20_2:
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 320(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 64(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 448(%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 192(%eax), %ecx
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 288(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 32(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 416(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 160(%eax), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 352(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 96(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 480(%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 224(%eax), %ecx
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 272(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 16(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 400(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 144(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 336(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 80(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 464(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 208(%eax), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 304(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 48(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 432(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 176(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 368(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 112(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 496(%eax), %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: andl 240(%eax), %ebx
-; X86-NEXT: orl %ecx, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 264(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 8(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 392(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 136(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 328(%ebx), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 72(%ebx), %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 456(%ebx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 200(%ebx), %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 296(%ebx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 40(%ebx), %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 424(%ebx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 168(%ebx), %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 360(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 104(%ebx), %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 488(%ebx), %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 232(%ebx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 280(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 24(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 408(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 152(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 344(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 88(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 472(%ebx), %eax
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, 24(%ecx)
+; X86-NEXT: movl %esi, 20(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 216(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 312(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 56(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 440(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 184(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 376(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 120(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 504(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 248(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 324(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 68(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 452(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 196(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 292(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 36(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 420(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 164(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 356(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 100(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 484(%ebx), %eax
+; X86-NEXT: movl %esi, 16(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 228(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 276(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 20(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 404(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 148(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 340(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 84(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 468(%ebx), %eax
+; X86-NEXT: movl %esi, 12(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 212(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %esi, 8(%ecx)
+; X86-NEXT: movl %edi, 4(%ecx)
+; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 308(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 52(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %eax, 28(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 436(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 180(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %eax, 32(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 372(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 116(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %eax, 36(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 500(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 244(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, 40(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 268(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 12(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %eax, 44(%ecx)
+; X86-NEXT: movl %edx, 48(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 396(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 140(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %eax, 52(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 332(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 76(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %eax, 56(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 460(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 204(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 300(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 44(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %eax, 60(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 428(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 172(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 364(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 108(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 492(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 236(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 284(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 28(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 412(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 156(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 348(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 92(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 476(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 220(%ebx), %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 316(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 60(%ebx), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 444(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 188(%ebx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 380(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 124(%ebx), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 508(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: andl 252(%esi), %ebx
-; X86-NEXT: orl %ecx, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: negl %ecx
-; X86-NEXT: movl 1648(%esp,%ecx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: andl 128(%edx), %ecx
-; X86-NEXT: andl 384(%edx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: andl (%edx), %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 256(%edx), %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 260(%edx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 4(%edx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 132(%edx), %eax
-; X86-NEXT: andl 388(%edx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: setne %al
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -5483,1545 +1873,157 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: test_ne_i4096:
+; SSE-LABEL: blsr_u512:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $1576, %rsp # imm = 0x628
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl %esi, %eax
-; SSE-NEXT: andl $4032, %eax # imm = 0xFC0
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: negl %eax
-; SSE-NEXT: movslq %eax, %rsi
-; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1304(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1560(%rsp,%rsi), %rax
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1176(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1432(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1240(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1496(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1112(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; SSE-NEXT: movq 1368(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1272(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1528(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1144(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1400(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1208(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1464(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1080(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1336(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1288(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1544(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1160(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1416(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1216(%rsp,%rsi), %r11
-; SSE-NEXT: movq 1224(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %r11, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1480(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1088(%rsp,%rsi), %r9
-; SSE-NEXT: movq 1096(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %r9, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1352(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1248(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1512(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1120(%rsp,%rsi), %rax
-; SSE-NEXT: movq 1128(%rsp,%rsi), %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rax, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1376(%rsp,%rsi), %r13
-; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx
-; SSE-NEXT: movq %rbx, %r8
-; SSE-NEXT: shldq %cl, %r13, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1192(%rsp,%rsi), %r15
-; SSE-NEXT: movq %r15, %r14
-; SSE-NEXT: shldq %cl, %rdx, %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1440(%rsp,%rsi), %r10
-; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, %r14
-; SSE-NEXT: shldq %cl, %r10, %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1312(%rsp,%rsi), %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp
-; SSE-NEXT: movq %rbp, %r12
-; SSE-NEXT: shldq %cl, %r14, %r12
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx
-; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rbp, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r9
-; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r13
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r12, %r15
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r12, %r10
-; SSE-NEXT: andq 384(%rdi), %r10
-; SSE-NEXT: andq 128(%rdi), %r15
-; SSE-NEXT: andq 320(%rdi), %r13
-; SSE-NEXT: andq 64(%rdi), %rax
-; SSE-NEXT: orq %r10, %r15
-; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: andq 448(%rdi), %r9
-; SSE-NEXT: andq 192(%rdi), %rbp
-; SSE-NEXT: orq %r9, %rbp
-; SSE-NEXT: orq %rax, %rbp
-; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq 288(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 32(%rdi), %r9
-; SSE-NEXT: andq 416(%rdi), %rdx
-; SSE-NEXT: andq 160(%rdi), %r11
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 352(%rdi), %rdx
-; SSE-NEXT: orq %r9, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 96(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 480(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 224(%rdi), %r8
-; SSE-NEXT: orq %rax, %r8
-; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq 272(%rdi), %r14
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 16(%rdi), %rax
-; SSE-NEXT: orq %r14, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 400(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 144(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 336(%rdi), %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 80(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 464(%rdi), %rdx
-; SSE-NEXT: orq %r9, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 208(%rdi), %r11
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: orq %rax, %r11
-; SSE-NEXT: orq %r8, %r11
-; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload
-; SSE-NEXT: andq 304(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 48(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 432(%rdi), %r9
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 176(%rdi), %r8
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 368(%rdi), %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 112(%rdi), %rax
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: movq %r8, %r10
-; SSE-NEXT: orq %r9, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 496(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE-NEXT: andq 240(%rdi), %rbp
-; SSE-NEXT: orq %r8, %rbp
-; SSE-NEXT: orq %rax, %rbp
-; SSE-NEXT: orq %r10, %rbp
-; SSE-NEXT: orq %r11, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 392(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: andq 136(%rdi), %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 328(%rdi), %rdx
-; SSE-NEXT: orq %rax, %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 72(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 456(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE-NEXT: andq 200(%rdi), %r13
-; SSE-NEXT: orq %rax, %r13
-; SSE-NEXT: orq %rdx, %r13
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 296(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 40(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 424(%rdi), %r8
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 168(%rdi), %rdx
-; SSE-NEXT: orq %r8, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 360(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 104(%rdi), %rax
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 488(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: andq 232(%rdi), %r15
-; SSE-NEXT: orq %rax, %r15
-; SSE-NEXT: orq %r8, %r15
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 280(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 24(%rdi), %rax
-; SSE-NEXT: orq %rdx, %r15
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 408(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 152(%rdi), %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 344(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 88(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 472(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE-NEXT: andq 216(%rdi), %r14
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: orq %rax, %r14
-; SSE-NEXT: orq %r8, %r14
-; SSE-NEXT: orq %r10, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 312(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 56(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 440(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 184(%rdi), %r9
-; SSE-NEXT: orq %r11, %r10
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: orq %r10, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 376(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 120(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 504(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 248(%rdi), %r8
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: movq 1056(%rsp,%rsi), %rax
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: andq 256(%rdi), %rdx
-; SSE-NEXT: orq %r14, %r8
-; SSE-NEXT: andq (%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq %rbp, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: andq 264(%rdi), %rcx
-; SSE-NEXT: andq 8(%rdi), %rbx
-; SSE-NEXT: orq %rcx, %rbx
-; SSE-NEXT: orq %r12, %rbx
-; SSE-NEXT: orq %r13, %rbx
-; SSE-NEXT: orq %r15, %rbx
-; SSE-NEXT: orq %r8, %rbx
-; SSE-NEXT: orq %rax, %rbx
-; SSE-NEXT: setne %al
-; SSE-NEXT: addq $1576, %rsp # imm = 0x628
+; SSE-NEXT: movq 48(%rdi), %r11
+; SSE-NEXT: movq 40(%rdi), %r9
+; SSE-NEXT: movq 24(%rdi), %r8
+; SSE-NEXT: movq 16(%rdi), %rdx
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %rsi
+; SSE-NEXT: rep bsfq %rcx, %rax
+; SSE-NEXT: rep bsfq %rsi, %rbx
+; SSE-NEXT: addq $64, %rbx
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovneq %rax, %rbx
+; SSE-NEXT: rep bsfq %rdx, %rax
+; SSE-NEXT: rep bsfq %r8, %r10
+; SSE-NEXT: addq $64, %r10
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovneq %rax, %r10
+; SSE-NEXT: movq 32(%rdi), %r14
+; SSE-NEXT: subq $-128, %r10
+; SSE-NEXT: movq %rcx, %rax
+; SSE-NEXT: orq %rsi, %rax
+; SSE-NEXT: cmovneq %rbx, %r10
+; SSE-NEXT: rep bsfq %r14, %rax
+; SSE-NEXT: rep bsfq %r9, %rbx
+; SSE-NEXT: addq $64, %rbx
+; SSE-NEXT: testq %r14, %r14
+; SSE-NEXT: cmovneq %rax, %rbx
+; SSE-NEXT: rep bsfq %r11, %r15
+; SSE-NEXT: movl $64, %eax
+; SSE-NEXT: rep bsfq 56(%rdi), %rax
+; SSE-NEXT: addq $64, %rax
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovneq %r15, %rax
+; SSE-NEXT: subq $-128, %rax
+; SSE-NEXT: orq %r9, %r14
+; SSE-NEXT: cmovneq %rbx, %rax
+; SSE-NEXT: addq $256, %rax # imm = 0x100
+; SSE-NEXT: orq %r8, %rsi
+; SSE-NEXT: orq %rdx, %rcx
+; SSE-NEXT: orq %rsi, %rcx
+; SSE-NEXT: cmovneq %r10, %rax
+; SSE-NEXT: movl $-2, %edx
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: roll %cl, %edx
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: shrl $3, %ecx
+; SSE-NEXT: andl $60, %ecx
+; SSE-NEXT: andl %edx, (%rdi,%rcx)
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
-; AVX2-LABEL: test_ne_i4096:
+; AVX2-LABEL: blsr_u512:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $1560, %rsp # imm = 0x618
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: movslq %eax, %rsi
-; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11
-; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %r11, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12
-; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %r12, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp
-; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rbp, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax
-; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10
-; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8
-; AVX2-NEXT: movq %r8, %rdx
-; AVX2-NEXT: shldq %cl, %r10, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx
-; AVX2-NEXT: movq %rbx, %rdx
-; AVX2-NEXT: shldq %cl, %r9, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9
-; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, %r14
-; AVX2-NEXT: shldq %cl, %r9, %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14
-; AVX2-NEXT: movq %r14, %r13
-; AVX2-NEXT: shldq %cl, %r15, %r13
-; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx
-; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, %r9
-; AVX2-NEXT: andq 384(%rdi), %r9
-; AVX2-NEXT: andq 128(%rdi), %r14
-; AVX2-NEXT: andq 320(%rdi), %r10
-; AVX2-NEXT: orq %r9, %r14
-; AVX2-NEXT: movq %r14, %r15
-; AVX2-NEXT: andq 64(%rdi), %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: andq 448(%rdi), %rbp
-; AVX2-NEXT: andq 192(%rdi), %r13
-; AVX2-NEXT: orq %rbp, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq 288(%rdi), %r8
-; AVX2-NEXT: andq 32(%rdi), %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 416(%rdi), %rax
-; AVX2-NEXT: orq %r8, %r12
-; AVX2-NEXT: andq 160(%rdi), %r11
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: andq 352(%rdi), %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 96(%rdi), %rax
-; AVX2-NEXT: orq %r12, %r11
-; AVX2-NEXT: orq %rbx, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 480(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: andq 224(%rdi), %r13
-; AVX2-NEXT: orq %r10, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 272(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 16(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r13
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 400(%rdi), %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 144(%rdi), %rax
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 336(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 80(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 464(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 208(%rdi), %r11
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: orq %r8, %r11
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: orq %r9, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 304(%rdi), %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 48(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 432(%rdi), %r10
-; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload
-; AVX2-NEXT: andq 176(%rdi), %rax
-; AVX2-NEXT: orq %r9, %r8
-; AVX2-NEXT: movq %r8, %r9
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 368(%rdi), %r8
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 112(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 496(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 240(%rdi), %r9
-; AVX2-NEXT: orq %r8, %r9
-; AVX2-NEXT: orq %rax, %r9
-; AVX2-NEXT: orq %r10, %r9
-; AVX2-NEXT: orq %r11, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 392(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX2-NEXT: andq 136(%rdi), %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 328(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 72(%rdi), %rax
-; AVX2-NEXT: orq %r10, %rbp
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 456(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX2-NEXT: andq 200(%rdi), %r12
-; AVX2-NEXT: orq %rax, %r12
-; AVX2-NEXT: orq %r8, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 296(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 40(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 424(%rdi), %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 168(%rdi), %rax
-; AVX2-NEXT: orq %r10, %r8
-; AVX2-NEXT: movq %r8, %r10
-; AVX2-NEXT: orq %r11, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 360(%rdi), %r8
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 104(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 488(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: andq 232(%rdi), %r14
-; AVX2-NEXT: orq %rax, %r14
-; AVX2-NEXT: orq %r8, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 280(%rdi), %r8
-; AVX2-NEXT: orq %r10, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 24(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 408(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 152(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 344(%rdi), %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 88(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 472(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: andq 216(%rdi), %rbx
-; AVX2-NEXT: orq %rax, %rbx
-; AVX2-NEXT: orq %r8, %rbx
-; AVX2-NEXT: orq %r10, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 312(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 56(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 440(%rdi), %r10
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 184(%rdi), %r8
-; AVX2-NEXT: orq %r10, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 376(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 120(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: movq %r8, %r11
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 504(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 248(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r8, %r10
-; AVX2-NEXT: orq %r11, %rax
-; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi
-; AVX2-NEXT: orq %rbx, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: shlxq %rcx, %rsi, %rax
-; AVX2-NEXT: andq 256(%rdi), %r10
-; AVX2-NEXT: andq (%rdi), %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: orq %r15, %rax
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX2-NEXT: orq %r13, %rax
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %rsi, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: andq 264(%rdi), %rcx
-; AVX2-NEXT: andq 8(%rdi), %rdx
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: orq %rbp, %rdx
-; AVX2-NEXT: orq %r12, %rdx
-; AVX2-NEXT: orq %r14, %rdx
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: orq %rax, %rdx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: addq $1560, %rsp # imm = 0x618
+; AVX2-NEXT: movq 40(%rdi), %r9
+; AVX2-NEXT: movq 32(%rdi), %r10
+; AVX2-NEXT: movq 24(%rdi), %r8
+; AVX2-NEXT: movq 16(%rdi), %rdx
+; AVX2-NEXT: movq (%rdi), %rcx
+; AVX2-NEXT: movq 8(%rdi), %rsi
+; AVX2-NEXT: tzcntq %rcx, %rax
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %rsi, %rbx
+; AVX2-NEXT: addq $64, %rbx
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovneq %rax, %rbx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %rdx, %rax
+; AVX2-NEXT: tzcntq %r8, %r11
+; AVX2-NEXT: addq $64, %r11
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovneq %rax, %r11
+; AVX2-NEXT: subq $-128, %r11
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: orq %rsi, %rax
+; AVX2-NEXT: cmovneq %rbx, %r11
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %r10, %rax
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %r9, %rbx
+; AVX2-NEXT: addq $64, %rbx
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovneq %rax, %rbx
+; AVX2-NEXT: movq 48(%rdi), %r14
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: tzcntq %r14, %r15
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq 56(%rdi), %rax
+; AVX2-NEXT: addq $64, %rax
+; AVX2-NEXT: testq %r14, %r14
+; AVX2-NEXT: cmovneq %r15, %rax
+; AVX2-NEXT: subq $-128, %rax
+; AVX2-NEXT: orq %r9, %r10
+; AVX2-NEXT: cmovneq %rbx, %rax
+; AVX2-NEXT: addq $256, %rax # imm = 0x100
+; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: cmovneq %r11, %rax
+; AVX2-NEXT: movl $-2, %edx
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: roll %cl, %edx
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $3, %ecx
+; AVX2-NEXT: andl $60, %ecx
+; AVX2-NEXT: andl %edx, (%rdi,%rcx)
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_ne_i4096:
+; AVX512-LABEL: blsr_u512:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $1560, %rsp # imm = 0x618
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl %esi, %eax
-; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %eax
-; AVX512-NEXT: negl %eax
-; AVX512-NEXT: movslq %eax, %rsi
-; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10
-; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14
-; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r14, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12
-; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r12, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax
-; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11
-; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx
-; AVX512-NEXT: movq %rbx, %rdx
-; AVX512-NEXT: shldq %cl, %r11, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9
-; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8
-; AVX512-NEXT: movq %r8, %rdx
-; AVX512-NEXT: shldq %cl, %r9, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9
-; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, %r15
-; AVX512-NEXT: shldq %cl, %r9, %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp
-; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15
-; AVX512-NEXT: movq %r15, %r13
-; AVX512-NEXT: shldq %cl, %rbp, %r13
-; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx
-; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r13
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbp, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbp, %r9
-; AVX512-NEXT: andq 384(%rdi), %r9
-; AVX512-NEXT: andq 128(%rdi), %r15
-; AVX512-NEXT: orq %r9, %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq 320(%rdi), %r11
-; AVX512-NEXT: andq 64(%rdi), %rax
-; AVX512-NEXT: orq %r11, %rax
-; AVX512-NEXT: andq 448(%rdi), %r12
-; AVX512-NEXT: andq 192(%rdi), %r13
-; AVX512-NEXT: orq %r12, %r13
-; AVX512-NEXT: orq %rax, %r13
-; AVX512-NEXT: andq 288(%rdi), %r8
-; AVX512-NEXT: andq 32(%rdi), %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 416(%rdi), %rax
-; AVX512-NEXT: orq %r8, %r14
-; AVX512-NEXT: andq 160(%rdi), %r10
-; AVX512-NEXT: orq %rax, %r10
-; AVX512-NEXT: andq 352(%rdi), %rbx
-; AVX512-NEXT: orq %r14, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 96(%rdi), %rax
-; AVX512-NEXT: orq %rbx, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 480(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: andq 224(%rdi), %r15
-; AVX512-NEXT: orq %rax, %r15
-; AVX512-NEXT: orq %r8, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 272(%rdi), %r8
-; AVX512-NEXT: orq %r10, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 16(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 400(%rdi), %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 144(%rdi), %rax
-; AVX512-NEXT: orq %r9, %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 336(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 80(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 464(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: andq 208(%rdi), %r11
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: orq %r8, %r11
-; AVX512-NEXT: orq %rax, %r11
-; AVX512-NEXT: orq %r9, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 304(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 48(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 432(%rdi), %r9
-; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 176(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: orq %r9, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 368(%rdi), %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 112(%rdi), %rax
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: movq %r8, %r10
-; AVX512-NEXT: orq %r9, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 496(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 240(%rdi), %r9
-; AVX512-NEXT: orq %r8, %r9
-; AVX512-NEXT: orq %rax, %r9
-; AVX512-NEXT: orq %r10, %r9
-; AVX512-NEXT: orq %r11, %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 392(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: andq 136(%rdi), %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 328(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 72(%rdi), %rax
-; AVX512-NEXT: orq %r10, %rbp
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 456(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX512-NEXT: andq 200(%rdi), %r12
-; AVX512-NEXT: orq %rax, %r12
-; AVX512-NEXT: orq %r8, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 296(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 40(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 424(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 168(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 360(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 104(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 488(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT: andq 232(%rdi), %r14
-; AVX512-NEXT: orq %rax, %r14
-; AVX512-NEXT: orq %r8, %r14
-; AVX512-NEXT: orq %r10, %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 280(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 24(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 408(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 152(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: andq 344(%rdi), %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 88(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 472(%rdi), %rax
-; AVX512-NEXT: orq %r11, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: andq 216(%rdi), %rbx
-; AVX512-NEXT: orq %rax, %rbx
-; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: orq %r10, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 312(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 56(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 440(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 184(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 376(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 120(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 504(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 248(%rdi), %r8
-; AVX512-NEXT: orq %rax, %r8
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: orq %r11, %r8
-; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rsi, %r10
-; AVX512-NEXT: orq %rbx, %r8
-; AVX512-NEXT: shlxq %rcx, %rax, %rsi
-; AVX512-NEXT: andq 256(%rdi), %r10
-; AVX512-NEXT: andq (%rdi), %rsi
-; AVX512-NEXT: orq %r10, %rsi
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT: orq %r13, %rsi
-; AVX512-NEXT: orq %r15, %rsi
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: orq %r9, %rsi
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 264(%rdi), %rax
-; AVX512-NEXT: andq 8(%rdi), %rdx
-; AVX512-NEXT: orq %rax, %rdx
-; AVX512-NEXT: orq %rbp, %rdx
-; AVX512-NEXT: orq %r12, %rdx
-; AVX512-NEXT: orq %r14, %rdx
-; AVX512-NEXT: orq %r8, %rdx
-; AVX512-NEXT: orq %rsi, %rdx
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: addq $1560, %rsp # imm = 0x618
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpandnq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512-NEXT: vpsubq %zmm1, %zmm2, %zmm1
+; AVX512-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: movl $-2, %edx
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: roll %cl, %edx
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shrl $3, %ecx
+; AVX512-NEXT: andl $60, %ecx
+; AVX512-NEXT: andl %edx, (%rdi,%rcx)
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %rem = and i32 %position, 4095
- %ofs = zext nneg i32 %rem to i4096
- %bit = shl nuw i4096 1, %ofs
- %ld = load i4096, ptr %word
- %test = and i4096 %ld, %bit
- %cmp = icmp ne i4096 %test, 0
- ret i1 %cmp
+ %ld = load i512, ptr %word
+ %tz = tail call range(i512 0, 513) i512 @llvm.cttz.i512(i512 %ld, i1 false)
+ %tz.cast = trunc nuw nsw i512 %tz to i32
+ %tz.mask = and i512 %tz, 511
+ %mask = shl nuw i512 1, %tz.mask
+ %mask.not = xor i512 %mask, -1
+ %blsr = and i512 %ld, %mask.not
+ store i512 %blsr, ptr %word
+ ret i32 %tz.cast
}
diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll b/llvm/test/CodeGen/X86/build-vector-128.ll
index e2db8d4..59eb776 100644
--- a/llvm/test/CodeGen/X86/build-vector-128.ll
+++ b/llvm/test/CodeGen/X86/build-vector-128.ll
@@ -410,6 +410,234 @@ define <16 x i8> @test_buildvector_v16i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
ret <16 x i8> %ins15
}
+; build vectors where integer operands are split (typically via legalization)
+
+define <4 x i32> @test_buildvector_v2i64_split_v4i32(i64 %a0, i64 %a1) nounwind {
+; SSE-32-LABEL: test_buildvector_v2i64_split_v4i32:
+; SSE-32: # %bb.0:
+; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
+; SSE-32-NEXT: retl
+;
+; SSE-64-LABEL: test_buildvector_v2i64_split_v4i32:
+; SSE-64: # %bb.0:
+; SSE-64-NEXT: movq %rsi, %xmm1
+; SSE-64-NEXT: movq %rdi, %xmm0
+; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-64-NEXT: retq
+;
+; AVX-32-LABEL: test_buildvector_v2i64_split_v4i32:
+; AVX-32: # %bb.0:
+; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX-32-NEXT: retl
+;
+; AVX-64-LABEL: test_buildvector_v2i64_split_v4i32:
+; AVX-64: # %bb.0:
+; AVX-64-NEXT: vmovq %rsi, %xmm0
+; AVX-64-NEXT: vmovq %rdi, %xmm1
+; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX-64-NEXT: retq
+ %a0.lo = trunc i64 %a0 to i32
+ %a1.lo = trunc i64 %a1 to i32
+ %a0.shr = lshr i64 %a0, 32
+ %a1.shr = lshr i64 %a1, 32
+ %a0.hi = trunc i64 %a0.shr to i32
+ %a1.hi = trunc i64 %a1.shr to i32
+ %v0 = insertelement <4 x i32> poison, i32 %a0.lo, i64 0
+ %v1 = insertelement <4 x i32> %v0, i32 %a0.hi, i64 1
+ %v2 = insertelement <4 x i32> %v1, i32 %a1.lo, i64 2
+ %v3 = insertelement <4 x i32> %v2, i32 %a1.hi, i64 3
+ ret <4 x i32> %v3
+}
+
+define <8 x i16> @test_buildvector_v4i32_split_v8i16(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
+; SSE-32-LABEL: test_buildvector_v4i32_split_v8i16:
+; SSE-32: # %bb.0:
+; SSE-32-NEXT: movups {{[0-9]+}}(%esp), %xmm0
+; SSE-32-NEXT: retl
+;
+; SSE2-64-LABEL: test_buildvector_v4i32_split_v8i16:
+; SSE2-64: # %bb.0:
+; SSE2-64-NEXT: movd %ecx, %xmm0
+; SSE2-64-NEXT: movd %edx, %xmm1
+; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-64-NEXT: movd %esi, %xmm2
+; SSE2-64-NEXT: movd %edi, %xmm0
+; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-64-NEXT: retq
+;
+; SSE41-64-LABEL: test_buildvector_v4i32_split_v8i16:
+; SSE41-64: # %bb.0:
+; SSE41-64-NEXT: movd %edi, %xmm0
+; SSE41-64-NEXT: pinsrd $1, %esi, %xmm0
+; SSE41-64-NEXT: pinsrd $2, %edx, %xmm0
+; SSE41-64-NEXT: pinsrd $3, %ecx, %xmm0
+; SSE41-64-NEXT: retq
+;
+; AVX-32-LABEL: test_buildvector_v4i32_split_v8i16:
+; AVX-32: # %bb.0:
+; AVX-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX-32-NEXT: retl
+;
+; AVX-64-LABEL: test_buildvector_v4i32_split_v8i16:
+; AVX-64: # %bb.0:
+; AVX-64-NEXT: vmovd %edi, %xmm0
+; AVX-64-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
+; AVX-64-NEXT: retq
+ %a0.lo = trunc i32 %a0 to i16
+ %a1.lo = trunc i32 %a1 to i16
+ %a2.lo = trunc i32 %a2 to i16
+ %a3.lo = trunc i32 %a3 to i16
+ %a0.shr = lshr i32 %a0, 16
+ %a1.shr = lshr i32 %a1, 16
+ %a2.shr = lshr i32 %a2, 16
+ %a3.shr = lshr i32 %a3, 16
+ %a0.hi = trunc i32 %a0.shr to i16
+ %a1.hi = trunc i32 %a1.shr to i16
+ %a2.hi = trunc i32 %a2.shr to i16
+ %a3.hi = trunc i32 %a3.shr to i16
+ %v0 = insertelement <8 x i16> poison, i16 %a0.lo, i64 0
+ %v1 = insertelement <8 x i16> %v0, i16 %a0.hi, i64 1
+ %v2 = insertelement <8 x i16> %v1, i16 %a1.lo, i64 2
+ %v3 = insertelement <8 x i16> %v2, i16 %a1.hi, i64 3
+ %v4 = insertelement <8 x i16> %v3, i16 %a2.lo, i64 4
+ %v5 = insertelement <8 x i16> %v4, i16 %a2.hi, i64 5
+ %v6 = insertelement <8 x i16> %v5, i16 %a3.lo, i64 6
+ %v7 = insertelement <8 x i16> %v6, i16 %a3.hi, i64 7
+ ret <8 x i16> %v7
+}
+
+define <16 x i8> @test_buildvector_v8i16_split_v16i8(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind {
+; SSE2-32-LABEL: test_buildvector_v8i16_split_v16i8:
+; SSE2-32: # %bb.0:
+; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-32-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-32-NEXT: retl
+;
+; SSE2-64-LABEL: test_buildvector_v8i16_split_v16i8:
+; SSE2-64: # %bb.0:
+; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-64-NEXT: movd %r9d, %xmm0
+; SSE2-64-NEXT: movd %r8d, %xmm2
+; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-64-NEXT: movd %ecx, %xmm0
+; SSE2-64-NEXT: movd %edx, %xmm1
+; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-64-NEXT: movd %esi, %xmm3
+; SSE2-64-NEXT: movd %edi, %xmm0
+; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-64-NEXT: retq
+;
+; SSE41-32-LABEL: test_buildvector_v8i16_split_v16i8:
+; SSE41-32: # %bb.0:
+; SSE41-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-32-NEXT: pinsrw $1, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT: pinsrw $2, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT: pinsrw $4, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT: pinsrw $6, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT: pinsrw $7, {{[0-9]+}}(%esp), %xmm0
+; SSE41-32-NEXT: retl
+;
+; SSE41-64-LABEL: test_buildvector_v8i16_split_v16i8:
+; SSE41-64: # %bb.0:
+; SSE41-64-NEXT: movd %edi, %xmm0
+; SSE41-64-NEXT: pinsrw $1, %esi, %xmm0
+; SSE41-64-NEXT: pinsrw $2, %edx, %xmm0
+; SSE41-64-NEXT: pinsrw $3, %ecx, %xmm0
+; SSE41-64-NEXT: pinsrw $4, %r8d, %xmm0
+; SSE41-64-NEXT: pinsrw $5, %r9d, %xmm0
+; SSE41-64-NEXT: pinsrw $6, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-64-NEXT: pinsrw $7, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-64-NEXT: retq
+;
+; AVX-32-LABEL: test_buildvector_v8i16_split_v16i8:
+; AVX-32: # %bb.0:
+; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $6, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: vpinsrw $7, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT: retl
+;
+; AVX-64-LABEL: test_buildvector_v8i16_split_v16i8:
+; AVX-64: # %bb.0:
+; AVX-64-NEXT: vmovd %edi, %xmm0
+; AVX-64-NEXT: vpinsrw $1, %esi, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $2, %edx, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $3, %ecx, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $4, %r8d, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $5, %r9d, %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $6, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: vpinsrw $7, {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-64-NEXT: retq
+ %a0.lo = trunc i16 %a0 to i8
+ %a1.lo = trunc i16 %a1 to i8
+ %a2.lo = trunc i16 %a2 to i8
+ %a3.lo = trunc i16 %a3 to i8
+ %a4.lo = trunc i16 %a4 to i8
+ %a5.lo = trunc i16 %a5 to i8
+ %a6.lo = trunc i16 %a6 to i8
+ %a7.lo = trunc i16 %a7 to i8
+ %a0.shr = lshr i16 %a0, 8
+ %a1.shr = lshr i16 %a1, 8
+ %a2.shr = lshr i16 %a2, 8
+ %a3.shr = lshr i16 %a3, 8
+ %a4.shr = lshr i16 %a4, 8
+ %a5.shr = lshr i16 %a5, 8
+ %a6.shr = lshr i16 %a6, 8
+ %a7.shr = lshr i16 %a7, 8
+ %a0.hi = trunc i16 %a0.shr to i8
+ %a1.hi = trunc i16 %a1.shr to i8
+ %a2.hi = trunc i16 %a2.shr to i8
+ %a3.hi = trunc i16 %a3.shr to i8
+ %a4.hi = trunc i16 %a4.shr to i8
+ %a5.hi = trunc i16 %a5.shr to i8
+ %a6.hi = trunc i16 %a6.shr to i8
+ %a7.hi = trunc i16 %a7.shr to i8
+ %v0 = insertelement <16 x i8> poison, i8 %a0.lo, i64 0
+ %v1 = insertelement <16 x i8> %v0, i8 %a0.hi, i64 1
+ %v2 = insertelement <16 x i8> %v1, i8 %a1.lo, i64 2
+ %v3 = insertelement <16 x i8> %v2, i8 %a1.hi, i64 3
+ %v4 = insertelement <16 x i8> %v3, i8 %a2.lo, i64 4
+ %v5 = insertelement <16 x i8> %v4, i8 %a2.hi, i64 5
+ %v6 = insertelement <16 x i8> %v5, i8 %a3.lo, i64 6
+ %v7 = insertelement <16 x i8> %v6, i8 %a3.hi, i64 7
+ %v8 = insertelement <16 x i8> %v7, i8 %a4.lo, i64 8
+ %v9 = insertelement <16 x i8> %v8, i8 %a4.hi, i64 9
+ %v10 = insertelement <16 x i8> %v9, i8 %a5.lo, i64 10
+ %v11 = insertelement <16 x i8> %v10, i8 %a5.hi, i64 11
+ %v12 = insertelement <16 x i8> %v11, i8 %a6.lo, i64 12
+ %v13 = insertelement <16 x i8> %v12, i8 %a6.hi, i64 13
+ %v14 = insertelement <16 x i8> %v13, i8 %a7.lo, i64 14
+ %v15 = insertelement <16 x i8> %v14, i8 %a7.hi, i64 15
+ ret <16 x i8> %v15
+}
+
; build vectors of repeated elements
define <4 x float> @test_buildvector_4f32_2_var(float %a0, float %a1) {
diff --git a/llvm/test/CodeGen/X86/build-vector-256.ll b/llvm/test/CodeGen/X86/build-vector-256.ll
index 3edb712..773eb8f 100644
--- a/llvm/test/CodeGen/X86/build-vector-256.ll
+++ b/llvm/test/CodeGen/X86/build-vector-256.ll
@@ -417,9 +417,8 @@ define <32 x i8> @test_buildvector_v32i8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4,
define <4 x double> @test_buildvector_4f64_2_var(double %a0, double %a1) {
; AVX1-32-LABEL: test_buildvector_4f64_2_var:
; AVX1-32: # %bb.0:
-; AVX1-32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
-; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-32-NEXT: vmovupd {{[0-9]+}}(%esp), %xmm0
+; AVX1-32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-32-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
index f36baba..ab8498d 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
@@ -14,7 +14,6 @@ entry:
}
; CHECK: _ZL10myCallbacki:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
define internal void @_ZL10myCallbacki(i32 %value) !type !2 {
entry:
%sink = alloca i32, align 4
@@ -33,6 +32,6 @@ entry:
;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
; CHECK-NEXT: .byte 1
;; Function Entry PC
-; CHECK-NEXT: .quad [[LABEL_FUNC]]
+; CHECK-NEXT: .quad _ZL10myCallbacki
;; Function type ID
; CHECK-NEXT: .quad -5212364466660467813
diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
index cdbad66..02d7107 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
@@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8)
declare !type !2 ptr @direct_baz(ptr)
; CHECK: ball:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
define ptr @ball() {
entry:
call void @direct_foo()
@@ -42,7 +41,7 @@ entry:
;; Flags
; CHECK-NEXT: .byte 7
;; Function Entry PC
-; CHECK-NEXT: .quad [[LABEL_FUNC]]
+; CHECK-NEXT: .quad ball
;; Function type ID -- set to 0 as no type metadata attached to function.
; CHECK-NEXT: .quad 0
;; Number of unique direct callees.
diff --git a/llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-loc.mir b/llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-loc.mir
index ef9fb22..8211f89 100644
--- a/llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-loc.mir
+++ b/llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-loc.mir
@@ -1,4 +1,3 @@
-# REQUIRES: asserts
# RUN: not --crash llc -o - %s -mtriple=x86_64-- \
# RUN: -run-pass=cfi-instr-inserter 2>&1 | FileCheck %s
# Test that CSR being saved in multiple locations can be caught by
@@ -10,8 +9,7 @@
}
...
---
-# CHECK: Different saved locations for the same CSR
-# CHECK-NEXT: UNREACHABLE executed
+# CHECK: LLVM ERROR: Different saved locations for the same CSR
name: inconsistentlocs
body: |
bb.0:
diff --git a/llvm/test/CodeGen/X86/chain_order.ll b/llvm/test/CodeGen/X86/chain_order.ll
index 3ced27f..18faec5 100644
--- a/llvm/test/CodeGen/X86/chain_order.ll
+++ b/llvm/test/CodeGen/X86/chain_order.ll
@@ -6,9 +6,8 @@ define void @cftx020(ptr nocapture %a) {
; CHECK-LABEL: cftx020:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = mem[1,0]
; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vmovupd (%rdi), %xmm1
; CHECK-NEXT: vmovupd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll
index 4d41c84..a42a715 100644
--- a/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll
+++ b/llvm/test/CodeGen/X86/coalescer-dead-flag-verifier-error.ll
@@ -7,8 +7,8 @@
define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0_(ptr %r) {
; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0_:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: .p2align 4
@@ -68,8 +68,8 @@ _ZNK4llvm5APInt13getActiveBitsEv.exit.i.i: ; preds = %for.body.i.i.i.i.i
define void @_ZNK4llvm5APInt21multiplicativeInverseERKS0__assert(ptr %r) {
; CHECK-LABEL: _ZNK4llvm5APInt21multiplicativeInverseERKS0__assert:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: jmp .LBB1_1
; CHECK-NEXT: .p2align 4
diff --git a/llvm/test/CodeGen/X86/combine-fceil.ll b/llvm/test/CodeGen/X86/combine-fceil.ll
new file mode 100644
index 0000000..a3f55e8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-fceil.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_ceil_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_ceil_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $10, %xmm0, %xmm0
+; SSE-NEXT: roundpd $10, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_ceil_v4f64_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $10, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_ceil_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_ceil_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $10, %xmm0, %xmm0
+; SSE-NEXT: roundps $10, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_ceil_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundps $10, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_ceil_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_ceil_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $10, %xmm0, %xmm0
+; SSE-NEXT: roundpd $10, %xmm1, %xmm1
+; SSE-NEXT: roundpd $10, %xmm2, %xmm2
+; SSE-NEXT: roundpd $10, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_ceil_v8f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $10, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundpd $10, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_ceil_v8f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $10, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundpd $10, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_ceil_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_ceil_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_ceil_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $10, %xmm0, %xmm0
+; SSE-NEXT: roundps $10, %xmm1, %xmm1
+; SSE-NEXT: roundps $10, %xmm2, %xmm2
+; SSE-NEXT: roundps $10, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_ceil_v16f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $10, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundps $10, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_ceil_v16f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $10, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundps $10, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_ceil_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_ceil_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_ceil_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $10, %xmm0, %xmm0
+; SSE-NEXT: roundpd $10, %xmm1, %xmm1
+; SSE-NEXT: roundpd $10, %xmm2, %xmm2
+; SSE-NEXT: roundpd $10, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_ceil_v8f64_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundpd $10, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundpd $10, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_ceil_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.ceil.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.ceil.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_ceil_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_ceil_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $10, %xmm0, %xmm0
+; SSE-NEXT: roundps $10, %xmm1, %xmm1
+; SSE-NEXT: roundps $10, %xmm2, %xmm2
+; SSE-NEXT: roundps $10, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_ceil_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundps $10, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundps $10, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_ceil_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.ceil.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-fcmp.ll b/llvm/test/CodeGen/X86/combine-fcmp.ll
new file mode 100644
index 0000000..f2666f6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-fcmp.ll
@@ -0,0 +1,330 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+
+define i4 @concat_fcmp_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_fcmp_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: xorpd %xmm2, %xmm2
+; SSE-NEXT: xorpd %xmm3, %xmm3
+; SSE-NEXT: cmpltpd %xmm0, %xmm3
+; SSE-NEXT: cmpltpd %xmm1, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
+; SSE-NEXT: movmskps %xmm3, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_fcmp_v4f64_v2f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT: vcmpltpd %xmm0, %xmm2, %xmm0
+; AVX1OR2-NEXT: vcmpltpd %xmm1, %xmm2, %xmm1
+; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX1OR2-NEXT: vmovmskps %xmm0, %eax
+; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_fcmp_v4f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vcmpltpd %xmm0, %xmm2, %k0
+; AVX512-NEXT: vcmpltpd %xmm1, %xmm2, %k1
+; AVX512-NEXT: kshiftlb $2, %k1, %k1
+; AVX512-NEXT: korw %k1, %k0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
+ %v0 = fcmp ogt <2 x double> %a0, zeroinitializer
+ %v1 = fcmp ogt <2 x double> %a1, zeroinitializer
+ %v = shufflevector <2 x i1> %v0, <2 x i1> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r = bitcast <4 x i1> %v to i4
+ ret i4 %r
+}
+
+define i8 @concat_fcmp_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_fcmp_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: xorps %xmm2, %xmm2
+; SSE-NEXT: cmpeqps %xmm2, %xmm0
+; SSE-NEXT: cmpeqps %xmm2, %xmm1
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_fcmp_v8f32_v4f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT: vcmpeqps %xmm2, %xmm0, %xmm0
+; AVX1OR2-NEXT: vcmpeqps %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_fcmp_v8f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcmpeqps %ymm1, %ymm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = fcmp oeq <4 x float> %a0, zeroinitializer
+ %v1 = fcmp oeq <4 x float> %a1, zeroinitializer
+ %v = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r = bitcast <8 x i1> %v to i8
+ ret i8 %r
+}
+
+define i8 @concat_fcmp_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_fcmp_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: xorpd %xmm4, %xmm4
+; SSE-NEXT: cmpltpd %xmm4, %xmm0
+; SSE-NEXT: cmpltpd %xmm4, %xmm1
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: cmpltpd %xmm4, %xmm2
+; SSE-NEXT: cmpltpd %xmm4, %xmm3
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm0, %xmm0
+; SSE-NEXT: packssdw %xmm2, %xmm2
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_fcmp_v8f64_v2f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; AVX1OR2-NEXT: vcmpltpd %xmm4, %xmm0, %xmm0
+; AVX1OR2-NEXT: vcmpltpd %xmm4, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vcmpltpd %xmm4, %xmm2, %xmm1
+; AVX1OR2-NEXT: vcmpltpd %xmm4, %xmm3, %xmm2
+; AVX1OR2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_fcmp_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcmpltpd %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = fcmp olt <2 x double> %a0, zeroinitializer
+ %v1 = fcmp olt <2 x double> %a1, zeroinitializer
+ %v2 = fcmp olt <2 x double> %a2, zeroinitializer
+ %v3 = fcmp olt <2 x double> %a3, zeroinitializer
+ %v01 = shufflevector <2 x i1> %v0, <2 x i1> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v23 = shufflevector <2 x i1> %v2, <2 x i1> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v = shufflevector <4 x i1> %v01, <4 x i1> %v23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r = bitcast <8 x i1> %v to i8
+ ret i8 %r
+}
+
+define i16 @concat_fcmp_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_fcmp_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: xorps %xmm4, %xmm4
+; SSE-NEXT: xorps %xmm5, %xmm5
+; SSE-NEXT: cmpleps %xmm0, %xmm5
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cmpleps %xmm1, %xmm0
+; SSE-NEXT: packssdw %xmm0, %xmm5
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cmpleps %xmm2, %xmm0
+; SSE-NEXT: cmpleps %xmm3, %xmm4
+; SSE-NEXT: packssdw %xmm4, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm5
+; SSE-NEXT: pmovmskb %xmm5, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_fcmp_v16f32_v4f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX1OR2-NEXT: vcmpleps %xmm0, %xmm4, %xmm0
+; AVX1OR2-NEXT: vcmpleps %xmm1, %xmm4, %xmm1
+; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vcmpleps %xmm2, %xmm4, %xmm1
+; AVX1OR2-NEXT: vcmpleps %xmm3, %xmm4, %xmm2
+; AVX1OR2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_fcmp_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcmpleps %zmm0, %zmm1, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = fcmp oge <4 x float> %a0, zeroinitializer
+ %v1 = fcmp oge <4 x float> %a1, zeroinitializer
+ %v2 = fcmp oge <4 x float> %a2, zeroinitializer
+ %v3 = fcmp oge <4 x float> %a3, zeroinitializer
+ %v01 = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v23 = shufflevector <4 x i1> %v2, <4 x i1> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v = shufflevector <8 x i1> %v01, <8 x i1> %v23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %r = bitcast <16 x i1> %v to i16
+ ret i16 %r
+}
+
+define i8 @concat_fcmp_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_fcmp_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: xorpd %xmm4, %xmm4
+; SSE-NEXT: movapd %xmm1, %xmm5
+; SSE-NEXT: cmpneqpd %xmm4, %xmm5
+; SSE-NEXT: cmpordpd %xmm4, %xmm1
+; SSE-NEXT: andpd %xmm5, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: movapd %xmm0, %xmm5
+; SSE-NEXT: cmpneqpd %xmm4, %xmm5
+; SSE-NEXT: cmpordpd %xmm4, %xmm0
+; SSE-NEXT: andpd %xmm5, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT: movapd %xmm3, %xmm1
+; SSE-NEXT: cmpneqpd %xmm4, %xmm1
+; SSE-NEXT: cmpordpd %xmm4, %xmm3
+; SSE-NEXT: andpd %xmm1, %xmm3
+; SSE-NEXT: movapd %xmm2, %xmm1
+; SSE-NEXT: cmpneqpd %xmm4, %xmm1
+; SSE-NEXT: cmpordpd %xmm4, %xmm2
+; SSE-NEXT: andpd %xmm1, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packssdw %xmm2, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_fcmp_v8f64_v4f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; AVX1-NEXT: vcmpneq_oqpd %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_fcmp_v8f64_v4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vcmpneq_oqpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; AVX2-NEXT: vcmpneq_oqpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_fcmp_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = fcmp one <4 x double> %a0, zeroinitializer
+ %v1 = fcmp one <4 x double> %a1, zeroinitializer
+ %v = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r = bitcast <8 x i1> %v to i8
+ ret i8 %r
+}
+
+define i16 @concat_fcmp_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_fcmp_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: xorps %xmm4, %xmm4
+; SSE-NEXT: cmpleps %xmm4, %xmm1
+; SSE-NEXT: cmpleps %xmm4, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: cmpleps %xmm4, %xmm3
+; SSE-NEXT: cmpleps %xmm4, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_fcmp_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT: vcmpleps %ymm2, %ymm0, %ymm0
+; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1OR2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1OR2-NEXT: vcmpleps %ymm2, %ymm1, %ymm1
+; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1OR2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1OR2-NEXT: vzeroupper
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_fcmp_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vcmpleps %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = fcmp ole <8 x float> %a0, zeroinitializer
+ %v1 = fcmp ole <8 x float> %a1, zeroinitializer
+ %v = shufflevector <8 x i1> %v0, <8 x i1> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %r = bitcast <16 x i1> %v to i16
+ ret i16 %r
+}
diff --git a/llvm/test/CodeGen/X86/combine-ffloor.ll b/llvm/test/CodeGen/X86/combine-ffloor.ll
new file mode 100644
index 0000000..5cde95e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-ffloor.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_floor_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_floor_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $9, %xmm0, %xmm0
+; SSE-NEXT: roundpd $9, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_floor_v4f64_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $9, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a1)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_floor_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_floor_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $9, %xmm0, %xmm0
+; SSE-NEXT: roundps $9, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_floor_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundps $9, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_floor_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_floor_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $9, %xmm0, %xmm0
+; SSE-NEXT: roundpd $9, %xmm1, %xmm1
+; SSE-NEXT: roundpd $9, %xmm2, %xmm2
+; SSE-NEXT: roundpd $9, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_floor_v8f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $9, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundpd $9, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_floor_v8f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $9, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundpd $9, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_floor_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.floor.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_floor_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_floor_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $9, %xmm0, %xmm0
+; SSE-NEXT: roundps $9, %xmm1, %xmm1
+; SSE-NEXT: roundps $9, %xmm2, %xmm2
+; SSE-NEXT: roundps $9, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_floor_v16f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $9, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundps $9, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_floor_v16f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $9, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundps $9, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_floor_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.floor.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_floor_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_floor_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $9, %xmm0, %xmm0
+; SSE-NEXT: roundpd $9, %xmm1, %xmm1
+; SSE-NEXT: roundpd $9, %xmm2, %xmm2
+; SSE-NEXT: roundpd $9, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_floor_v8f64_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundpd $9, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundpd $9, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_floor_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $9, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.floor.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.floor.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_floor_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_floor_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $9, %xmm0, %xmm0
+; SSE-NEXT: roundps $9, %xmm1, %xmm1
+; SSE-NEXT: roundps $9, %xmm2, %xmm2
+; SSE-NEXT: roundps $9, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_floor_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundps $9, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundps $9, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_floor_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $9, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.floor.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.floor.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-fnearbyint.ll b/llvm/test/CodeGen/X86/combine-fnearbyint.ll
new file mode 100644
index 0000000..fde136a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-fnearbyint.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_nearbyint_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_nearbyint_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $12, %xmm0, %xmm0
+; SSE-NEXT: roundpd $12, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_nearbyint_v4f64_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $12, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a1)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_nearbyint_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_nearbyint_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $12, %xmm0, %xmm0
+; SSE-NEXT: roundps $12, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_nearbyint_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundps $12, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_nearbyint_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_nearbyint_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $12, %xmm0, %xmm0
+; SSE-NEXT: roundpd $12, %xmm1, %xmm1
+; SSE-NEXT: roundpd $12, %xmm2, %xmm2
+; SSE-NEXT: roundpd $12, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_nearbyint_v8f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $12, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundpd $12, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_nearbyint_v8f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $12, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundpd $12, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_nearbyint_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $12, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_nearbyint_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_nearbyint_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $12, %xmm0, %xmm0
+; SSE-NEXT: roundps $12, %xmm1, %xmm1
+; SSE-NEXT: roundps $12, %xmm2, %xmm2
+; SSE-NEXT: roundps $12, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_nearbyint_v16f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $12, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundps $12, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_nearbyint_v16f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $12, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundps $12, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_nearbyint_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $12, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_nearbyint_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_nearbyint_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $12, %xmm0, %xmm0
+; SSE-NEXT: roundpd $12, %xmm1, %xmm1
+; SSE-NEXT: roundpd $12, %xmm2, %xmm2
+; SSE-NEXT: roundpd $12, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_nearbyint_v8f64_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundpd $12, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundpd $12, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_nearbyint_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $12, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_nearbyint_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_nearbyint_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $12, %xmm0, %xmm0
+; SSE-NEXT: roundps $12, %xmm1, %xmm1
+; SSE-NEXT: roundps $12, %xmm2, %xmm2
+; SSE-NEXT: roundps $12, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_nearbyint_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundps $12, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundps $12, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_nearbyint_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $12, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-frint.ll b/llvm/test/CodeGen/X86/combine-frint.ll
new file mode 100644
index 0000000..1c52529
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-frint.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_rint_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_rint_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $4, %xmm0, %xmm0
+; SSE-NEXT: roundpd $4, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_rint_v4f64_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $4, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a1)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_rint_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_rint_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $4, %xmm0, %xmm0
+; SSE-NEXT: roundps $4, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_rint_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundps $4, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_rint_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_rint_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $4, %xmm0, %xmm0
+; SSE-NEXT: roundpd $4, %xmm1, %xmm1
+; SSE-NEXT: roundpd $4, %xmm2, %xmm2
+; SSE-NEXT: roundpd $4, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_rint_v8f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundpd $4, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_rint_v8f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $4, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundpd $4, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_rint_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.rint.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_rint_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_rint_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $4, %xmm0, %xmm0
+; SSE-NEXT: roundps $4, %xmm1, %xmm1
+; SSE-NEXT: roundps $4, %xmm2, %xmm2
+; SSE-NEXT: roundps $4, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_rint_v16f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundps $4, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_rint_v16f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $4, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundps $4, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_rint_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.rint.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_rint_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_rint_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $4, %xmm0, %xmm0
+; SSE-NEXT: roundpd $4, %xmm1, %xmm1
+; SSE-NEXT: roundpd $4, %xmm2, %xmm2
+; SSE-NEXT: roundpd $4, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_rint_v8f64_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundpd $4, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundpd $4, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_rint_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.rint.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_rint_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_rint_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $4, %xmm0, %xmm0
+; SSE-NEXT: roundps $4, %xmm1, %xmm1
+; SSE-NEXT: roundps $4, %xmm2, %xmm2
+; SSE-NEXT: roundps $4, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_rint_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundps $4, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundps $4, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_rint_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.rint.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.rint.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-fround.ll b/llvm/test/CodeGen/X86/combine-fround.ll
new file mode 100644
index 0000000..42dbaf2
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-fround.ll
@@ -0,0 +1,419 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_round_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_round_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: movapd {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0]
+; SSE-NEXT: movapd %xmm0, %xmm3
+; SSE-NEXT: andpd %xmm2, %xmm3
+; SSE-NEXT: movapd {{.*#+}} xmm4 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; SSE-NEXT: orpd %xmm4, %xmm3
+; SSE-NEXT: addpd %xmm0, %xmm3
+; SSE-NEXT: roundpd $11, %xmm3, %xmm0
+; SSE-NEXT: andpd %xmm1, %xmm2
+; SSE-NEXT: orpd %xmm4, %xmm2
+; SSE-NEXT: addpd %xmm1, %xmm2
+; SSE-NEXT: roundpd $11, %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_round_v4f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX1-NEXT: vorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_round_v4f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX2-NEXT: vandpd %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX2-NEXT: vorpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_round_v4f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & m64bcst)
+; AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.round.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.round.v2f64(<2 x double> %a1)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_round_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_round_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: andps %xmm2, %xmm3
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; SSE-NEXT: orps %xmm4, %xmm3
+; SSE-NEXT: addps %xmm0, %xmm3
+; SSE-NEXT: roundps $11, %xmm3, %xmm0
+; SSE-NEXT: andps %xmm1, %xmm2
+; SSE-NEXT: orps %xmm4, %xmm2
+; SSE-NEXT: addps %xmm1, %xmm2
+; SSE-NEXT: roundps $11, %xmm2, %xmm1
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_round_v8f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_round_v8f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm1
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_round_v8f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpternlogd {{.*#+}} ymm2 = ymm2 | (ymm0 & m32bcst)
+; AVX512-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.round.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.round.v4f32(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_round_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_round_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: movapd {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0]
+; SSE-NEXT: movapd %xmm0, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: movapd {{.*#+}} xmm6 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm0, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm0
+; SSE-NEXT: movapd %xmm1, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm1, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm1
+; SSE-NEXT: movapd %xmm2, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm2, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm2
+; SSE-NEXT: andpd %xmm3, %xmm4
+; SSE-NEXT: orpd %xmm6, %xmm4
+; SSE-NEXT: addpd %xmm3, %xmm4
+; SSE-NEXT: roundpd $11, %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_round_v8f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vandpd %ymm1, %ymm0, %ymm4
+; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX1-NEXT: vorpd %ymm5, %ymm4, %ymm4
+; AVX1-NEXT: vaddpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vandpd %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorpd %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vaddpd %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_round_v8f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX2-NEXT: vandpd %ymm1, %ymm0, %ymm4
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX2-NEXT: vorpd %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vaddpd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vandpd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vorpd %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vaddpd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_round_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & m64bcst)
+; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.round.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.round.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.round.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.round.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_round_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_round_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE-NEXT: movaps %xmm0, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: movaps {{.*#+}} xmm6 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm0, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm0
+; SSE-NEXT: movaps %xmm1, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm1, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm1
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm2, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm2
+; SSE-NEXT: andps %xmm3, %xmm4
+; SSE-NEXT: orps %xmm6, %xmm4
+; SSE-NEXT: addps %xmm3, %xmm4
+; SSE-NEXT: roundps $11, %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_round_v16f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm4
+; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX1-NEXT: vorps %ymm5, %ymm4, %ymm4
+; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
+; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_round_v16f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm4
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX2-NEXT: vorps %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vandps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vorps %ymm5, %ymm1, %ymm1
+; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_round_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm0 & m32bcst)
+; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.round.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.round.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.round.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.round.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_round_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_round_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: movapd {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0]
+; SSE-NEXT: movapd %xmm0, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: movapd {{.*#+}} xmm6 = [4.9999999999999994E-1,4.9999999999999994E-1]
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm0, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm0
+; SSE-NEXT: movapd %xmm1, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm1, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm1
+; SSE-NEXT: movapd %xmm2, %xmm5
+; SSE-NEXT: andpd %xmm4, %xmm5
+; SSE-NEXT: orpd %xmm6, %xmm5
+; SSE-NEXT: addpd %xmm2, %xmm5
+; SSE-NEXT: roundpd $11, %xmm5, %xmm2
+; SSE-NEXT: andpd %xmm3, %xmm4
+; SSE-NEXT: orpd %xmm6, %xmm4
+; SSE-NEXT: addpd %xmm3, %xmm4
+; SSE-NEXT: roundpd $11, %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_round_v8f64_v4f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm3
+; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vorpd %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_round_v8f64_v4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX2-NEXT: vandpd %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX2-NEXT: vorpd %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vaddpd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX2-NEXT: vandpd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vorpd %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_round_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | (zmm0 & m64bcst)
+; AVX512-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.round.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.round.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_round_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_round_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE-NEXT: movaps %xmm0, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: movaps {{.*#+}} xmm6 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm0, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm0
+; SSE-NEXT: movaps %xmm1, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm1, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm1
+; SSE-NEXT: movaps %xmm2, %xmm5
+; SSE-NEXT: andps %xmm4, %xmm5
+; SSE-NEXT: orps %xmm6, %xmm5
+; SSE-NEXT: addps %xmm2, %xmm5
+; SSE-NEXT: roundps $11, %xmm5, %xmm2
+; SSE-NEXT: andps %xmm3, %xmm4
+; SSE-NEXT: orps %xmm6, %xmm4
+; SSE-NEXT: addps %xmm3, %xmm4
+; SSE-NEXT: roundps $11, %xmm4, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_round_v16f32_v8f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm3
+; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vorps %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_round_v16f32_v8f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX2-NEXT: vorps %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vaddps %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vorps %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_round_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 | (zmm0 & m32bcst)
+; AVX512-NEXT: vaddps %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.round.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.round.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
diff --git a/llvm/test/CodeGen/X86/combine-froundeven.ll b/llvm/test/CodeGen/X86/combine-froundeven.ll
new file mode 100644
index 0000000..4bf1e86
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-froundeven.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_roundeven_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_roundeven_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $8, %xmm0, %xmm0
+; SSE-NEXT: roundpd $8, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_roundeven_v4f64_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $8, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a1)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_roundeven_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_roundeven_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $8, %xmm0, %xmm0
+; SSE-NEXT: roundps $8, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_roundeven_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundps $8, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_roundeven_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_roundeven_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $8, %xmm0, %xmm0
+; SSE-NEXT: roundpd $8, %xmm1, %xmm1
+; SSE-NEXT: roundpd $8, %xmm2, %xmm2
+; SSE-NEXT: roundpd $8, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_roundeven_v8f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $8, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundpd $8, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_roundeven_v8f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $8, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundpd $8, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundeven_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $8, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_roundeven_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_roundeven_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $8, %xmm0, %xmm0
+; SSE-NEXT: roundps $8, %xmm1, %xmm1
+; SSE-NEXT: roundps $8, %xmm2, %xmm2
+; SSE-NEXT: roundps $8, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_roundeven_v16f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $8, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundps $8, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_roundeven_v16f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $8, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundps $8, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundeven_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $8, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_roundeven_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_roundeven_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $8, %xmm0, %xmm0
+; SSE-NEXT: roundpd $8, %xmm1, %xmm1
+; SSE-NEXT: roundpd $8, %xmm2, %xmm2
+; SSE-NEXT: roundpd $8, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_roundeven_v8f64_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundpd $8, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundpd $8, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundeven_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $8, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_roundeven_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_roundeven_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $8, %xmm0, %xmm0
+; SSE-NEXT: roundps $8, %xmm1, %xmm1
+; SSE-NEXT: roundps $8, %xmm2, %xmm2
+; SSE-NEXT: roundps $8, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_roundeven_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundps $8, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundps $8, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundeven_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $8, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-fsqrt.ll b/llvm/test/CodeGen/X86/combine-fsqrt.ll
new file mode 100644
index 0000000..f30eac1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-fsqrt.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_sqrt_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_sqrt_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtpd %xmm0, %xmm0
+; SSE-NEXT: sqrtpd %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_sqrt_v4f64_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vsqrtpd %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a1)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_sqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_sqrt_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtps %xmm0, %xmm0
+; SSE-NEXT: sqrtps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_sqrt_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vsqrtps %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_sqrt_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_sqrt_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtpd %xmm0, %xmm0
+; SSE-NEXT: sqrtpd %xmm1, %xmm1
+; SSE-NEXT: sqrtpd %xmm2, %xmm2
+; SSE-NEXT: sqrtpd %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_sqrt_v8f64_v2f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT: vsqrtpd %ymm0, %ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1OR2-NEXT: vsqrtpd %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_sqrt_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vsqrtpd %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_sqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_sqrt_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtps %xmm0, %xmm0
+; SSE-NEXT: sqrtps %xmm1, %xmm1
+; SSE-NEXT: sqrtps %xmm2, %xmm2
+; SSE-NEXT: sqrtps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_sqrt_v16f32_v4f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT: vsqrtps %ymm0, %ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1OR2-NEXT: vsqrtps %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_sqrt_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vsqrtps %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_sqrt_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_sqrt_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtpd %xmm0, %xmm0
+; SSE-NEXT: sqrtpd %xmm1, %xmm1
+; SSE-NEXT: sqrtpd %xmm2, %xmm2
+; SSE-NEXT: sqrtpd %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_sqrt_v8f64_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vsqrtpd %ymm0, %ymm0
+; AVX1OR2-NEXT: vsqrtpd %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_sqrt_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vsqrtpd %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_sqrt_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_sqrt_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: sqrtps %xmm0, %xmm0
+; SSE-NEXT: sqrtps %xmm1, %xmm1
+; SSE-NEXT: sqrtps %xmm2, %xmm2
+; SSE-NEXT: sqrtps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_sqrt_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vsqrtps %ymm0, %ymm0
+; AVX1OR2-NEXT: vsqrtps %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_sqrt_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vsqrtps %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-ftrunc.ll b/llvm/test/CodeGen/X86/combine-ftrunc.ll
new file mode 100644
index 0000000..3dde226
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-ftrunc.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_trunc_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: concat_trunc_v4f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $11, %xmm0, %xmm0
+; SSE-NEXT: roundpd $11, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_trunc_v4f64_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a1)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_trunc_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_trunc_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $11, %xmm0, %xmm0
+; SSE-NEXT: roundps $11, %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_trunc_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_trunc_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; SSE-LABEL: concat_trunc_v8f64_v2f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $11, %xmm0, %xmm0
+; SSE-NEXT: roundpd $11, %xmm1, %xmm1
+; SSE-NEXT: roundpd $11, %xmm2, %xmm2
+; SSE-NEXT: roundpd $11, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_trunc_v8f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_trunc_v8f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_trunc_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a0)
+ %v1 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a1)
+ %v2 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a2)
+ %v3 = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a3)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_trunc_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_trunc_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $11, %xmm0, %xmm0
+; SSE-NEXT: roundps $11, %xmm1, %xmm1
+; SSE-NEXT: roundps $11, %xmm2, %xmm2
+; SSE-NEXT: roundps $11, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_trunc_v16f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_trunc_v16f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_trunc_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.trunc.v4f32(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_trunc_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: concat_trunc_v8f64_v4f64:
+; SSE: # %bb.0:
+; SSE-NEXT: roundpd $11, %xmm0, %xmm0
+; SSE-NEXT: roundpd $11, %xmm1, %xmm1
+; SSE-NEXT: roundpd $11, %xmm2, %xmm2
+; SSE-NEXT: roundpd $11, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_trunc_v8f64_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundpd $11, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundpd $11, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_trunc_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.trunc.v4f64(<4 x double> %a0)
+ %v1 = call <4 x double> @llvm.trunc.v4f64(<4 x double> %a1)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_trunc_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: concat_trunc_v16f32_v8f32:
+; SSE: # %bb.0:
+; SSE-NEXT: roundps $11, %xmm0, %xmm0
+; SSE-NEXT: roundps $11, %xmm1, %xmm1
+; SSE-NEXT: roundps $11, %xmm2, %xmm2
+; SSE-NEXT: roundps $11, %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_trunc_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundps $11, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundps $11, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_trunc_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.trunc.v8f32(<8 x float> %a0)
+ %v1 = call <8 x float> @llvm.trunc.v8f32(<8 x float> %a1)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-icmp.ll b/llvm/test/CodeGen/X86/combine-icmp.ll
new file mode 100644
index 0000000..dba5839
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-icmp.ll
@@ -0,0 +1,905 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE,SSE42
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+
+define i4 @concat_icmp_v4i64_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE2-LABEL: concat_icmp_v4i64_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSE2-NEXT: andps %xmm2, %xmm0
+; SSE2-NEXT: movmskps %xmm0, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: concat_icmp_v4i64_v2i64:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm2, %xmm2
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm2, %xmm1
+; SSE42-NEXT: packssdw %xmm1, %xmm0
+; SSE42-NEXT: movmskps %xmm0, %eax
+; SSE42-NEXT: xorl $15, %eax
+; SSE42-NEXT: # kill: def $al killed $al killed $eax
+; SSE42-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_icmp_v4i64_v2i64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vmovmskps %xmm0, %eax
+; AVX1OR2-NEXT: xorl $15, %eax
+; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v4i64_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vptestmq %xmm0, %xmm0, %k0
+; AVX512-NEXT: vptestmq %xmm1, %xmm1, %k1
+; AVX512-NEXT: kshiftlb $2, %k1, %k1
+; AVX512-NEXT: korw %k1, %k0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
+ %v0 = icmp ne <2 x i64> %a0, zeroinitializer
+ %v1 = icmp ne <2 x i64> %a1, zeroinitializer
+ %v = shufflevector <2 x i1> %v0, <2 x i1> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r = bitcast <4 x i1> %v to i4
+ ret i4 %r
+}
+
+define i8 @concat_icmp_v8i32_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: concat_icmp_v8i32_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm2, %xmm2
+; SSE-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: def $al killed $al killed $eax
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_icmp_v8i32_v4i32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v8i32_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp eq <4 x i32> %a0, zeroinitializer
+ %v1 = icmp eq <4 x i32> %a1, zeroinitializer
+ %v = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r = bitcast <8 x i1> %v to i8
+ ret i8 %r
+}
+
+define i16 @concat_icmp_v16i16_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE2-LABEL: concat_icmp_v16i16_v8i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: psubusw %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqw %xmm0, %xmm3
+; SSE2-NEXT: psubusw %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqw %xmm0, %xmm2
+; SSE2-NEXT: packsswb %xmm2, %xmm3
+; SSE2-NEXT: pmovmskb %xmm3, %eax
+; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: concat_icmp_v16i16_v8i16:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: pmaxuw %xmm2, %xmm3
+; SSE42-NEXT: pcmpeqw %xmm0, %xmm3
+; SSE42-NEXT: pmaxuw %xmm1, %xmm2
+; SSE42-NEXT: pcmpeqw %xmm1, %xmm2
+; SSE42-NEXT: packsswb %xmm2, %xmm3
+; SSE42-NEXT: pmovmskb %xmm3, %eax
+; SSE42-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v16i16_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v16i16_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpmaxuw %xmm2, %xmm0, %xmm3
+; AVX2-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpmaxuw %xmm2, %xmm1, %xmm2
+; AVX2-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v16i16_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp ugt <8 x i16> %a0, splat (i16 1)
+ %v1 = icmp ugt <8 x i16> %a1, splat (i16 1)
+ %v = shufflevector <8 x i1> %v0, <8 x i1> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %r = bitcast <16 x i1> %v to i16
+ ret i16 %r
+}
+
+define i32 @concat_icmp_v32i8_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: concat_icmp_v32i8_v16i8:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; SSE-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm2, %xmm1
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: pmovmskb %xmm1, %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v32i8_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; AVX1-NEXT: vpmovmskb %xmm1, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %ecx, %eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v32i8_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v32i8_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp sgt <16 x i8> %a0, splat (i8 5)
+ %v1 = icmp sgt <16 x i8> %a1, splat (i8 5)
+ %v = shufflevector <16 x i1> %v0, <16 x i1> %v1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %r = bitcast <32 x i1> %v to i32
+ ret i32 %r
+}
+
+define i8 @concat_icmp_v8i64_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i64> %a3) {
+; SSE2-LABEL: concat_icmp_v8i64_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483776,2147483776,2147483776,2147483648]
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pand %xmm7, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm5, %xmm7
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT: pand %xmm7, %xmm2
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm6, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
+; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: packsswb %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: concat_icmp_v8i64_v2i64:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: pxor %xmm4, %xmm0
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775936,9223372036854775936]
+; SSE42-NEXT: movdqa %xmm5, %xmm6
+; SSE42-NEXT: pcmpgtq %xmm0, %xmm6
+; SSE42-NEXT: pxor %xmm4, %xmm1
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
+; SSE42-NEXT: packssdw %xmm0, %xmm6
+; SSE42-NEXT: pxor %xmm4, %xmm2
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: pcmpgtq %xmm2, %xmm0
+; SSE42-NEXT: pxor %xmm4, %xmm3
+; SSE42-NEXT: pcmpgtq %xmm3, %xmm5
+; SSE42-NEXT: packssdw %xmm5, %xmm0
+; SSE42-NEXT: packssdw %xmm6, %xmm6
+; SSE42-NEXT: packssdw %xmm0, %xmm0
+; SSE42-NEXT: packsswb %xmm0, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,3]
+; SSE42-NEXT: pmovmskb %xmm0, %eax
+; SSE42-NEXT: # kill: def $al killed $al killed $eax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v8i64_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm4 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854775936,9223372036854775936]
+; AVX1-NEXT: # xmm5 = mem[0,0]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v8i64_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854775936,9223372036854775936]
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm1
+; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm2
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v8i64_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpltuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp ult <2 x i64> %a0, splat (i64 128)
+ %v1 = icmp ult <2 x i64> %a1, splat (i64 128)
+ %v2 = icmp ult <2 x i64> %a2, splat (i64 128)
+ %v3 = icmp ult <2 x i64> %a3, splat (i64 128)
+ %v01 = shufflevector <2 x i1> %v0, <2 x i1> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v23 = shufflevector <2 x i1> %v2, <2 x i1> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %v = shufflevector <4 x i1> %v01, <4 x i1> %v23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r = bitcast <8 x i1> %v to i8
+ ret i8 %r
+}
+
+define i16 @concat_icmp_v16i32_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
+; SSE-LABEL: concat_icmp_v16i32_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE-NEXT: pcmpgtd %xmm4, %xmm1
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: pcmpgtd %xmm4, %xmm2
+; SSE-NEXT: pcmpgtd %xmm4, %xmm3
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packsswb %xmm2, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_icmp_v16i32_v4i32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1OR2-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpcmpgtd %xmm4, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpcmpgtd %xmm4, %xmm2, %xmm1
+; AVX1OR2-NEXT: vpcmpgtd %xmm4, %xmm3, %xmm2
+; AVX1OR2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpmovmskb %xmm0, %eax
+; AVX1OR2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v16i32_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp sgt <4 x i32> %a0, zeroinitializer
+ %v1 = icmp sgt <4 x i32> %a1, zeroinitializer
+ %v2 = icmp sgt <4 x i32> %a2, zeroinitializer
+ %v3 = icmp sgt <4 x i32> %a3, zeroinitializer
+ %v01 = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v23 = shufflevector <4 x i1> %v2, <4 x i1> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %v = shufflevector <8 x i1> %v01, <8 x i1> %v23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %r = bitcast <16 x i1> %v to i16
+ ret i16 %r
+}
+
+define i32 @concat_icmp_v32i16_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
+; SSE-LABEL: concat_icmp_v32i16_v8i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: pcmpeqw %xmm4, %xmm0
+; SSE-NEXT: pcmpeqw %xmm4, %xmm1
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: pcmpeqw %xmm4, %xmm2
+; SSE-NEXT: pcmpeqw %xmm4, %xmm3
+; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: xorl $65535, %ecx # imm = 0xFFFF
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: notl %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v32i16_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm2
+; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; AVX1-NEXT: xorl $65535, %ecx # imm = 0xFFFF
+; AVX1-NEXT: vpmovmskb %xmm1, %eax
+; AVX1-NEXT: notl %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %ecx, %eax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v32i16_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpacksswb %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v32i16_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vptestmw %zmm0, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp ne <8 x i16> %a0, zeroinitializer
+ %v1 = icmp ne <8 x i16> %a1, zeroinitializer
+ %v2 = icmp ne <8 x i16> %a2, zeroinitializer
+ %v3 = icmp ne <8 x i16> %a3, zeroinitializer
+ %v01 = shufflevector <8 x i1> %v0, <8 x i1> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v23 = shufflevector <8 x i1> %v2, <8 x i1> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %v = shufflevector <16 x i1> %v01, <16 x i1> %v23, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %r = bitcast <32 x i1> %v to i32
+ ret i32 %r
+}
+
+define i64 @concat_icmp_v64i8_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16 x i8> %a3) {
+; SSE-LABEL: concat_icmp_v64i8_v16i8:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pmaxub %xmm4, %xmm5
+; SSE-NEXT: pcmpeqb %xmm0, %xmm5
+; SSE-NEXT: pmovmskb %xmm5, %eax
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: pmaxub %xmm4, %xmm0
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: orl %eax, %ecx
+; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pmaxub %xmm4, %xmm0
+; SSE-NEXT: pcmpeqb %xmm2, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %edx
+; SSE-NEXT: pmaxub %xmm3, %xmm4
+; SSE-NEXT: pcmpeqb %xmm3, %xmm4
+; SSE-NEXT: pmovmskb %xmm4, %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: shlq $32, %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v64i8_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX1-NEXT: vpmaxub %xmm4, %xmm0, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpmaxub %xmm4, %xmm1, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpmaxub %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpmaxub %xmm4, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: vpmovmskb %xmm1, %ecx
+; AVX1-NEXT: shll $16, %ecx
+; AVX1-NEXT: orl %eax, %ecx
+; AVX1-NEXT: vpmovmskb %xmm2, %edx
+; AVX1-NEXT: vpmovmskb %xmm3, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %edx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v64i8_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX2-NEXT: vpmaxub %xmm4, %xmm0, %xmm5
+; AVX2-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpmaxub %xmm4, %xmm1, %xmm5
+; AVX2-NEXT: vpcmpeqb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpmaxub %xmm4, %xmm2, %xmm5
+; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpmaxub %xmm4, %xmm3, %xmm4
+; AVX2-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v64i8_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512-NEXT: kmovq %k0, %rax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp ugt <16 x i8> %a0, splat (i8 15)
+ %v1 = icmp ugt <16 x i8> %a1, splat (i8 15)
+ %v2 = icmp ugt <16 x i8> %a2, splat (i8 15)
+ %v3 = icmp ugt <16 x i8> %a3, splat (i8 15)
+ %v01 = shufflevector <16 x i1> %v0, <16 x i1> %v1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %v23 = shufflevector <16 x i1> %v2, <16 x i1> %v3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %v = shufflevector <32 x i1> %v01, <32 x i1> %v23, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %r = bitcast <64 x i1> %v to i64
+ ret i64 %r
+}
+
+define i8 @concat_icmp_v8i64_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE2-LABEL: concat_icmp_v8i64_v4i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm4, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,3,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,3,2]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
+; SSE2-NEXT: andps %xmm1, %xmm2
+; SSE2-NEXT: packssdw %xmm2, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: packsswb %xmm0, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: # kill: def $al killed $al killed $eax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: concat_icmp_v8i64_v4i64:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pxor %xmm4, %xmm4
+; SSE42-NEXT: pcmpeqq %xmm4, %xmm1
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE42-NEXT: pcmpeqq %xmm4, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE42-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE42-NEXT: pcmpeqq %xmm4, %xmm3
+; SSE42-NEXT: pcmpeqq %xmm4, %xmm2
+; SSE42-NEXT: packssdw %xmm3, %xmm2
+; SSE42-NEXT: packssdw %xmm2, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE42-NEXT: packsswb %xmm0, %xmm0
+; SSE42-NEXT: pmovmskb %xmm0, %eax
+; SSE42-NEXT: # kill: def $al killed $al killed $eax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v8i64_v4i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: def $al killed $al killed $eax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v8i64_v4i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: def $al killed $al killed $eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v8i64_v4i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp eq <4 x i64> %a0, zeroinitializer
+ %v1 = icmp eq <4 x i64> %a1, zeroinitializer
+ %v = shufflevector <4 x i1> %v0, <4 x i1> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r = bitcast <8 x i1> %v to i8
+ ret i8 %r
+}
+
+define i16 @concat_icmp_v16i32_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE2-LABEL: concat_icmp_v16i32_v8i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483649,2147483649,2147483649,2147483649]
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm4, %xmm3
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
+; SSE2-NEXT: pxor %xmm4, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: packsswb %xmm2, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: concat_icmp_v16i32_v8i32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [2,2,2,2]
+; SSE42-NEXT: movdqa %xmm1, %xmm5
+; SSE42-NEXT: pmaxud %xmm4, %xmm5
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm5
+; SSE42-NEXT: movdqa %xmm0, %xmm1
+; SSE42-NEXT: pmaxud %xmm4, %xmm1
+; SSE42-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE42-NEXT: packssdw %xmm5, %xmm1
+; SSE42-NEXT: movdqa %xmm3, %xmm0
+; SSE42-NEXT: pmaxud %xmm4, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE42-NEXT: pmaxud %xmm2, %xmm4
+; SSE42-NEXT: pcmpeqd %xmm2, %xmm4
+; SSE42-NEXT: packssdw %xmm0, %xmm4
+; SSE42-NEXT: packsswb %xmm4, %xmm1
+; SSE42-NEXT: pmovmskb %xmm1, %eax
+; SSE42-NEXT: # kill: def $ax killed $ax killed $eax
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v16i32_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,2,2,2]
+; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpmaxud %xmm3, %xmm0, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v16i32_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2]
+; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpmaxud %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpmovmskb %xmm0, %eax
+; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v16i32_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp ugt <8 x i32> %a0, splat (i32 1)
+ %v1 = icmp ugt <8 x i32> %a1, splat (i32 1)
+ %v = shufflevector <8 x i1> %v0, <8 x i1> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %r = bitcast <16 x i1> %v to i16
+ ret i16 %r
+}
+
+define i32 @concat_icmp_v32i16_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: concat_icmp_v32i16_v16i16:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [5,5,5,5,5,5,5,5]
+; SSE-NEXT: pcmpgtw %xmm4, %xmm1
+; SSE-NEXT: pcmpgtw %xmm4, %xmm0
+; SSE-NEXT: packsswb %xmm1, %xmm0
+; SSE-NEXT: pcmpgtw %xmm4, %xmm3
+; SSE-NEXT: pcmpgtw %xmm4, %xmm2
+; SSE-NEXT: packsswb %xmm3, %xmm2
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: pmovmskb %xmm2, %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: orl %ecx, %eax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v32i16_v16i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,5,5,5,5,5,5,5]
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; AVX1-NEXT: vpmovmskb %xmm1, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %ecx, %eax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v32i16_v16i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
+; AVX2-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v32i16_v16i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512-NEXT: kmovd %k0, %eax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp sgt <16 x i16> %a0, splat (i16 5)
+ %v1 = icmp sgt <16 x i16> %a1, splat (i16 5)
+ %v = shufflevector <16 x i1> %v0, <16 x i1> %v1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ %r = bitcast <32 x i1> %v to i32
+ ret i32 %r
+}
+
+define i64 @concat_icmp_v64i8_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: concat_icmp_v64i8_v32i8:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: pcmpgtb %xmm0, %xmm5
+; SSE-NEXT: pmovmskb %xmm5, %eax
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %ecx
+; SSE-NEXT: shll $16, %ecx
+; SSE-NEXT: orl %eax, %ecx
+; SSE-NEXT: movdqa %xmm4, %xmm0
+; SSE-NEXT: pcmpgtb %xmm2, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %edx
+; SSE-NEXT: pcmpgtb %xmm3, %xmm4
+; SSE-NEXT: pmovmskb %xmm4, %eax
+; SSE-NEXT: shll $16, %eax
+; SSE-NEXT: orl %edx, %eax
+; SSE-NEXT: shlq $32, %rax
+; SSE-NEXT: orq %rcx, %rax
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: concat_icmp_v64i8_v32i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm3
+; AVX1-NEXT: vpmovmskb %xmm3, %eax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %ecx
+; AVX1-NEXT: shll $16, %ecx
+; AVX1-NEXT: orl %eax, %ecx
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %edx
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpmovmskb %xmm0, %eax
+; AVX1-NEXT: shll $16, %eax
+; AVX1-NEXT: orl %edx, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_icmp_v64i8_v32i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX2-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %ecx
+; AVX2-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm0
+; AVX2-NEXT: vpmovmskb %ymm0, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_icmp_v64i8_v32i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpcmpltb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k0
+; AVX512-NEXT: kmovq %k0, %rax
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %v0 = icmp slt <32 x i8> %a0, splat (i8 1)
+ %v1 = icmp slt <32 x i8> %a1, splat (i8 1)
+ %v = shufflevector <32 x i1> %v0, <32 x i1> %v1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %r = bitcast <64 x i1> %v to i64
+ ret i64 %r
+}
diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll
index 29c41ca..15d187a 100644
--- a/llvm/test/CodeGen/X86/combine-mul.ll
+++ b/llvm/test/CodeGen/X86/combine-mul.ll
@@ -504,7 +504,7 @@ define <16 x i8> @PR35579(<16 x i8> %x) {
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; SSE-NEXT: psllw $8, %xmm1
-; SSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,2,0,4,0,2,0,8,0,2,0,4,0,2,0]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,2,1,4,1,2,1,8,1,2,1,4,1,2,1]
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/combine-rcp.ll b/llvm/test/CodeGen/X86/combine-rcp.ll
new file mode 100644
index 0000000..4647516
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-rcp.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <8 x float> @concat_rcp_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_rcp_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: rcpps %xmm0, %xmm0
+; SSE-NEXT: rcpps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_rcp_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vrcpps %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+; Ensure we don't convert rcpps to rcp14ps
+define <16 x float> @concat_rcp_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_rcp_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: rcpps %xmm0, %xmm0
+; SSE-NEXT: rcpps %xmm1, %xmm1
+; SSE-NEXT: rcpps %xmm2, %xmm2
+; SSE-NEXT: rcpps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_rcp_v16f32_v4f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT: vrcpps %ymm0, %ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1OR2-NEXT: vrcpps %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_rcp_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vrcpps %ymm0, %ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX512-NEXT: vrcpps %ymm1, %ymm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-rndscale.ll b/llvm/test/CodeGen/X86/combine-rndscale.ll
new file mode 100644
index 0000000..b557dd8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-rndscale.ll
@@ -0,0 +1,162 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <4 x double> @concat_roundpd_v4f64_v2f64(<2 x double> %a0, <2 x double> %a1) {
+; AVX-LABEL: concat_roundpd_v4f64_v2f64:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundpd $4, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4)
+ %v1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a1, i32 4)
+ %res = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %res
+}
+
+define <8 x float> @concat_roundps_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; AVX-LABEL: concat_roundps_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vroundps $4, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4)
+ %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+define <8 x double> @concat_roundpd_v8f64_v2f64(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3) {
+; AVX1-LABEL: concat_roundpd_v8f64_v2f64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundpd $4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundpd $4, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_roundpd_v8f64_v2f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundpd $4, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundpd $4, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundpd_v8f64_v2f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4)
+ %v1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a1, i32 4)
+ %v2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a2, i32 4)
+ %v3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a3, i32 4)
+ %r01 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %r23 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %res = shufflevector <4 x double> %r01, <4 x double> %r23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_roundps_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; AVX1-LABEL: concat_roundps_v16f32_v4f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vroundps $4, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vroundps $4, %ymm1, %ymm1
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: concat_roundps_v16f32_v4f32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vroundps $4, %ymm0, %ymm0
+; AVX2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX2-NEXT: vroundps $4, %ymm1, %ymm1
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundps_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4)
+ %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4)
+ %v2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a2, i32 4)
+ %v3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a3, i32 4)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+define <8 x double> @concat_roundpd_v8f64_v4f64(<4 x double> %a0, <4 x double> %a1) {
+; AVX1OR2-LABEL: concat_roundpd_v8f64_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundpd $4, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundpd $4, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundpd_v8f64_v4f64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscalepd $4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
+ %v1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a1, i32 4)
+ %res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x double> %res
+}
+
+define <16 x float> @concat_roundps_v16f32_v8f32(<8 x float> %a0, <8 x float> %a1) {
+; AVX1OR2-LABEL: concat_roundps_v16f32_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vroundps $4, %ymm0, %ymm0
+; AVX1OR2-NEXT: vroundps $4, %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_roundps_v16f32_v8f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vrndscaleps $4, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
+ %v1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a1, i32 4)
+ %res = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
+
+; negative test - rounding mode mismatch
+define <8 x float> @concat_roundps_v8f32_v4f32_mismatch(<4 x float> %a0, <4 x float> %a1) {
+; AVX-LABEL: concat_roundps_v8f32_v4f32_mismatch:
+; AVX: # %bb.0:
+; AVX-NEXT: vroundps $0, %xmm0, %xmm0
+; AVX-NEXT: vroundps $4, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 0)
+ %v1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a1, i32 4)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-rsqrt.ll b/llvm/test/CodeGen/X86/combine-rsqrt.ll
new file mode 100644
index 0000000..b373458
--- /dev/null
+++ b/llvm/test/CodeGen/X86/combine-rsqrt.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX,AVX1OR2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
+
+define <8 x float> @concat_rsqrt_v8f32_v4f32(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: concat_rsqrt_v8f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: rsqrtps %xmm0, %xmm0
+; SSE-NEXT: rsqrtps %xmm1, %xmm1
+; SSE-NEXT: retq
+;
+; AVX-LABEL: concat_rsqrt_v8f32_v4f32:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vrsqrtps %ymm0, %ymm0
+; AVX-NEXT: retq
+ %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1)
+ %res = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+; Ensure we don't convert rsqrtps to rsqrt14ps
+define <16 x float> @concat_rsqrt_v16f32_v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3) {
+; SSE-LABEL: concat_rsqrt_v16f32_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: rsqrtps %xmm0, %xmm0
+; SSE-NEXT: rsqrtps %xmm1, %xmm1
+; SSE-NEXT: rsqrtps %xmm2, %xmm2
+; SSE-NEXT: rsqrtps %xmm3, %xmm3
+; SSE-NEXT: retq
+;
+; AVX1OR2-LABEL: concat_rsqrt_v16f32_v4f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1OR2-NEXT: vrsqrtps %ymm0, %ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1OR2-NEXT: vrsqrtps %ymm1, %ymm1
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: concat_rsqrt_v16f32_v4f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vrsqrtps %ymm0, %ymm0
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX512-NEXT: vrsqrtps %ymm1, %ymm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: retq
+ %v0 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
+ %v1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a1)
+ %v2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a2)
+ %v3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a3)
+ %r01 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %r23 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %res = shufflevector <8 x float> %r01, <8 x float> %r23, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x float> %res
+}
diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll
index 36e374b..e601c57 100644
--- a/llvm/test/CodeGen/X86/combine-sub-usat.ll
+++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll
@@ -112,6 +112,69 @@ define <8 x i16> @combine_zero_v8i16(<8 x i16> %a0) {
ret <8 x i16> %1
}
+; fold (usub_sat x, 1) -> sub(x, zext(x != 0))
+define i32 @combine_dec_i32(i32 %a0) {
+; CHECK-LABEL: combine_dec_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: subl $1, %edi
+; CHECK-NEXT: cmovael %edi, %eax
+; CHECK-NEXT: retq
+ %1 = call i32 @llvm.usub.sat.i32(i32 %a0, i32 1)
+ ret i32 %1
+}
+
+; fold (usub_sat x, 1) -> add(x, sext(x != 0))
+define <4 x i32> @combine_dec_v4i32(<4 x i32> %a0) {
+; SSE2-LABEL: combine_dec_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: combine_dec_v4i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: paddd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE42-LABEL: combine_dec_v4i32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE42-NEXT: paddd %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: combine_dec_v4i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_dec_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: combine_dec_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
+; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+ %1 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a0, <4 x i32> splat (i32 1))
+ ret <4 x i32> %1
+}
+
; fold (usub_sat x, x) -> 0
define i32 @combine_self_i32(i32 %a0) {
; CHECK-LABEL: combine_self_i32:
diff --git a/llvm/test/CodeGen/X86/compress-undef-float-passthrough.ll b/llvm/test/CodeGen/X86/compress-undef-float-passthrough.ll
index 47331db..b19112c 100644
--- a/llvm/test/CodeGen/X86/compress-undef-float-passthrough.ll
+++ b/llvm/test/CodeGen/X86/compress-undef-float-passthrough.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512f,avx512vl | FileCheck %s --check-prefix=CHECK
-define void @test_compress_undef_float_passthrough() {
+define void @test_compress_undef_float_passthrough(<4 x double> %a0) {
; CHECK-LABEL: test_compress_undef_float_passthrough:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: movb $5, %al
@@ -12,7 +12,7 @@ define void @test_compress_undef_float_passthrough() {
; CHECK-NEXT: retq
entry: ; preds = %loop.50
%0 = bitcast i4 undef to <4 x i1>
- %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> undef, <4 x double> undef, <4 x i1> <i1 1, i1 0, i1 1, i1 0>)
+ %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %a0, <4 x double> undef, <4 x i1> <i1 1, i1 0, i1 1, i1 0>)
call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %1, <4 x ptr> undef, i32 0, <4 x i1> %0)
ret void
}
diff --git a/llvm/test/CodeGen/X86/dag-combine-counter.ll b/llvm/test/CodeGen/X86/dag-combine-counter.ll
index 4cc3c71b..9b56586 100644
--- a/llvm/test/CodeGen/X86/dag-combine-counter.ll
+++ b/llvm/test/CodeGen/X86/dag-combine-counter.ll
@@ -1,8 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=x86_64-- -debug-counter=dagcombine=0-5 < %s | FileCheck %s
-; REQUIRES: asserts
-
define i32 @test(i32 %x) {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll
deleted file mode 100644
index 6bbf3eb..0000000
--- a/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; RUN: llc -x86-discriminate-memops < %s | FileCheck %s
-;
-; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling:
-; int sum(int* arr, int pos1, int pos2) {
-; return arr[pos1] + arr[pos2];
-; }
-;
-; ModuleID = 'test.cc'
-source_filename = "test.cc"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-declare void @llvm.prefetch(ptr, i32, i32, i32)
-; Function Attrs: norecurse nounwind readonly uwtable
-define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 {
-entry:
- %idxprom = sext i32 %pos1 to i64
- %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom
- %0 = load i32, ptr %arrayidx, align 4
- %idxprom1 = sext i32 %pos2 to i64
- %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1
- %1 = load i32, ptr %arrayidx2, align 4
- %add = add nsw i32 %1, %0, !dbg !15
- ret i32 %add
-}
-
-attributes #0 = { "target-cpu"="x86-64" }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true)
-!1 = !DIFile(filename: "test.cc", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"}
-!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
-!8 = !DISubroutineType(types: !2)
-!9 = !DILocation(line: 2, column: 10, scope: !7)
-!10 = !{!11, !11, i64 0}
-!11 = !{!"int", !12, i64 0}
-!12 = !{!"omnipotent char", !13, i64 0}
-!13 = !{!"Simple C++ TBAA"}
-!15 = !DILocation(line: 2, column: 20, scope: !7)
-
-
-;CHECK-LABEL: sum:
-;CHECK: # %bb.0:
-;CHECK: .loc 1 1 0 {{.*}} discriminator 2
-;CHECK-NEXT: movl (%rdi,%rax,4), %eax
-;CHECK-NEXT: .loc 1 2 20
-;CHECK-NEXT: addl (%rdi,%rcx,4), %eax
diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll
deleted file mode 100644
index ca412c5..0000000
--- a/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: llc -x86-discriminate-memops < %s | FileCheck %s
-; RUN: llc -x86-discriminate-memops -x86-bypass-prefetch-instructions=0 < %s | FileCheck %s -check-prefix=NOBYPASS
-;
-; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling:
-; int sum(int* arr, int pos1, int pos2) {
-; return arr[pos1] + arr[pos2];
-; }
-;
-; ModuleID = 'test.cc'
-source_filename = "test.cc"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-declare void @llvm.prefetch(ptr, i32, i32, i32)
-; Function Attrs: norecurse nounwind readonly uwtable
-define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 {
-entry:
- %idxprom = sext i32 %pos1 to i64, !dbg !9
- %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !9
- %0 = load i32, ptr %arrayidx, align 4, !dbg !9, !tbaa !10
- %idxprom1 = sext i32 %pos2 to i64, !dbg !14
- %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !14
- call void @llvm.prefetch(ptr %arrayidx2, i32 0, i32 3, i32 1)
- %1 = load i32, ptr %arrayidx2, align 4, !dbg !14, !tbaa !10
- %add = add nsw i32 %1, %0, !dbg !15
- ret i32 %add, !dbg !16
-}
-
-attributes #0 = { "target-cpu"="x86-64" }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true)
-!1 = !DIFile(filename: "test.cc", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"}
-!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
-!8 = !DISubroutineType(types: !2)
-!9 = !DILocation(line: 2, column: 10, scope: !7)
-!10 = !{!11, !11, i64 0}
-!11 = !{!"int", !12, i64 0}
-!12 = !{!"omnipotent char", !13, i64 0}
-!13 = !{!"Simple C++ TBAA"}
-!14 = !DILocation(line: 2, column: 22, scope: !7)
-!15 = !DILocation(line: 2, column: 20, scope: !7)
-!16 = !DILocation(line: 2, column: 3, scope: !7)
-
-;CHECK-LABEL: sum:
-;CHECK: # %bb.0:
-;CHECK: prefetcht0 (%rdi,%rax,4)
-;CHECK-NEXT: movl (%rdi,%rax,4), %eax
-;CHECK-NEXT: .loc 1 2 20 discriminator 2 # test.cc:2:20
-;CHECK-NEXT: addl (%rdi,%rcx,4), %eax
-;CHECK-NEXT: .loc 1 2 3 # test.cc:2:3
-
-;NOBYPASS-LABEL: sum:
-;NOBYPASS: # %bb.0:
-;NOBYPASS: prefetcht0 (%rdi,%rax,4)
-;NOBYPASS-NEXT: .loc 1 2 22
-;NOBYPASS-NEXT: movl (%rdi,%rax,4), %eax
-;NOBYPASS-NEXT: .loc 1 2 20 {{.*}} discriminator 2 # test.cc:2:20
-;NOBYPASS-NEXT: addl (%rdi,%rcx,4), %eax
-;NOBYPASS-NEXT: .loc 1 2 3 # test.cc:2:3
diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops.ll
deleted file mode 100644
index a8421d9..0000000
--- a/llvm/test/CodeGen/X86/discriminate-mem-ops.ll
+++ /dev/null
@@ -1,55 +0,0 @@
-; RUN: llc -x86-discriminate-memops < %s | FileCheck %s
-;
-; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling:
-; int sum(int* arr, int pos1, int pos2) {
-; return arr[pos1] + arr[pos2];
-; }
-;
-; ModuleID = 'test.cc'
-source_filename = "test.cc"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: norecurse nounwind readonly uwtable
-define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 {
-entry:
- %idxprom = sext i32 %pos1 to i64, !dbg !9
- %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !9
- %0 = load i32, ptr %arrayidx, align 4, !dbg !9, !tbaa !10
- %idxprom1 = sext i32 %pos2 to i64, !dbg !14
- %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !14
- %1 = load i32, ptr %arrayidx2, align 4, !dbg !14, !tbaa !10
- %add = add nsw i32 %1, %0, !dbg !15
- ret i32 %add, !dbg !16
-}
-
-attributes #0 = { "target-cpu"="x86-64" }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true)
-!1 = !DIFile(filename: "test.cc", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"}
-!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
-!8 = !DISubroutineType(types: !2)
-!9 = !DILocation(line: 2, column: 10, scope: !7)
-!10 = !{!11, !11, i64 0}
-!11 = !{!"int", !12, i64 0}
-!12 = !{!"omnipotent char", !13, i64 0}
-!13 = !{!"Simple C++ TBAA"}
-!14 = !DILocation(line: 2, column: 22, scope: !7)
-!15 = !DILocation(line: 2, column: 20, scope: !7)
-!16 = !DILocation(line: 2, column: 3, scope: !7)
-
-;CHECK-LABEL: sum:
-;CHECK: # %bb.0:
-;CHECK: movl (%rdi,%rax,4), %eax
-;CHECK-NEXT: .loc 1 2 20 discriminator 2 # test.cc:2:20
-;CHECK-NEXT: addl (%rdi,%rcx,4), %eax
-;CHECK-NEXT: .loc 1 2 3 # test.cc:2:3
diff --git a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
index 3243d95..e2400fb 100644
--- a/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
+++ b/llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll
@@ -106,7 +106,8 @@ define <4 x i32> @eq_or_eq_ult_2_fail_multiuse(<4 x i32> %x) {
; AVX512: # %bb.0:
; AVX512-NEXT: subq $24, %rsp
; AVX512-NEXT: .cfi_def_cfa_offset 32
-; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: callq use.v4.i32@PLT
; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/fmaxnum.ll b/llvm/test/CodeGen/X86/fmaxnum.ll
index 150bef0..6a03628 100644
--- a/llvm/test/CodeGen/X86/fmaxnum.ll
+++ b/llvm/test/CodeGen/X86/fmaxnum.ll
@@ -676,15 +676,44 @@ define float @test_maxnum_neg_inf_nnan(float %x, float %y) nounwind {
; Test SNaN quieting
define float @test_maxnum_snan(float %x) {
-; SSE-LABEL: test_maxnum_snan:
-; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE-NEXT: retq
+; SSE2-LABEL: test_maxnum_snan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: cmpunordss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: andps %xmm2, %xmm3
+; SSE2-NEXT: maxss %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: test_maxnum_snan:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; AVX-NEXT: retq
+; SSE4-LABEL: test_maxnum_snan:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE4-NEXT: maxss %xmm0, %xmm1
+; SSE4-NEXT: cmpunordss %xmm0, %xmm0
+; SSE4-NEXT: blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE4-NEXT: movaps %xmm1, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX1-LABEL: test_maxnum_snan:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX1-NEXT: vmaxss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test_maxnum_snan:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX512-NEXT: vmaxss %xmm0, %xmm2, %xmm1
+; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
%r = call float @llvm.maxnum.f32(float 0x7ff4000000000000, float %x)
ret float %r
}
diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
index aae6cda..e0dea64 100644
--- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
+++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll
@@ -8,8 +8,10 @@
declare float @llvm.maximumnum.f32(float, float)
declare double @llvm.maximumnum.f64(double, double)
+declare fp128 @llvm.maximumnum.f128(fp128, fp128)
declare float @llvm.minimumnum.f32(float, float)
declare double @llvm.minimumnum.f64(double, double)
+declare fp128 @llvm.minimumnum.f128(fp128, fp128)
declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>)
declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
declare <4 x half> @llvm.maximumnum.v4f16(<4 x half>, <4 x half>)
@@ -2569,3 +2571,383 @@ define float @test_fminimumnum_snan(float %x, float %y) {
%1 = tail call float @llvm.minimumnum.f32(float 0x7ff4000000000000, float %y)
ret float %1
}
+
+define fp128 @test_fmaximumnum_fp128(fp128 %x, fp128 %y) nounwind {
+; SSE2-LABEL: test_fmaximumnum_fp128:
+; SSE2: # %bb.0: # %start
+; SSE2-NEXT: subq $40, %rsp
+; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: callq __unordtf2@PLT
+; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: jne .LBB39_2
+; SSE2-NEXT: # %bb.1: # %start
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: .LBB39_2: # %start
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: callq __unordtf2@PLT
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: jne .LBB39_4
+; SSE2-NEXT: # %bb.3: # %start
+; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: .LBB39_4: # %start
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT: callq __gttf2@PLT
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: jg .LBB39_6
+; SSE2-NEXT: # %bb.5: # %start
+; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: .LBB39_6: # %start
+; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT: callq __trunctfsf2@PLT
+; SSE2-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: je .LBB39_8
+; SSE2-NEXT: # %bb.7: # %start
+; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: .LBB39_8: # %start
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: callq __eqtf2@PLT
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: je .LBB39_10
+; SSE2-NEXT: # %bb.9: # %start
+; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: .LBB39_10: # %start
+; SSE2-NEXT: addq $40, %rsp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fmaximumnum_fp128:
+; AVX: # %bb.0: # %start
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: callq __unordtf2@PLT
+; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: jne .LBB39_2
+; AVX-NEXT: # %bb.1: # %start
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: .LBB39_2: # %start
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: callq __unordtf2@PLT
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: jne .LBB39_4
+; AVX-NEXT: # %bb.3: # %start
+; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: .LBB39_4: # %start
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: callq __gttf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: vmovdqa %xmm0, %xmm1
+; AVX-NEXT: jg .LBB39_6
+; AVX-NEXT: # %bb.5: # %start
+; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: .LBB39_6: # %start
+; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: callq __trunctfsf2@PLT
+; AVX-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: je .LBB39_8
+; AVX-NEXT: # %bb.7: # %start
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: .LBB39_8: # %start
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: callq __eqtf2@PLT
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: je .LBB39_10
+; AVX-NEXT: # %bb.9: # %start
+; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: .LBB39_10: # %start
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fmaximumnum_fp128:
+; AVX10_2: # %bb.0: # %start
+; AVX10_2-NEXT: subq $40, %rsp
+; AVX10_2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT: vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT: callq __unordtf2@PLT
+; AVX10_2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT: testl %eax, %eax
+; AVX10_2-NEXT: vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT: jne .LBB39_2
+; AVX10_2-NEXT: # %bb.1: # %start
+; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT: .LBB39_2: # %start
+; AVX10_2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT: vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT: callq __unordtf2@PLT
+; AVX10_2-NEXT: testl %eax, %eax
+; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT: jne .LBB39_4
+; AVX10_2-NEXT: # %bb.3: # %start
+; AVX10_2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT: .LBB39_4: # %start
+; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT: callq __gttf2@PLT
+; AVX10_2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT: testl %eax, %eax
+; AVX10_2-NEXT: vmovdqa %xmm0, %xmm1
+; AVX10_2-NEXT: jg .LBB39_6
+; AVX10_2-NEXT: # %bb.5: # %start
+; AVX10_2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT: .LBB39_6: # %start
+; AVX10_2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT: callq __trunctfsf2@PLT
+; AVX10_2-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX10_2-NEXT: vmovd %xmm0, %eax
+; AVX10_2-NEXT: testl %eax, %eax
+; AVX10_2-NEXT: je .LBB39_8
+; AVX10_2-NEXT: # %bb.7: # %start
+; AVX10_2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT: .LBB39_8: # %start
+; AVX10_2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT: vmovaps %xmm2, %xmm0
+; AVX10_2-NEXT: callq __eqtf2@PLT
+; AVX10_2-NEXT: testl %eax, %eax
+; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT: je .LBB39_10
+; AVX10_2-NEXT: # %bb.9: # %start
+; AVX10_2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT: .LBB39_10: # %start
+; AVX10_2-NEXT: addq $40, %rsp
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fmaximumnum_fp128:
+; X86: # %bb.0: # %start
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: vmovups 24(%ebp), %ymm0
+; X86-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: vzeroupper
+; X86-NEXT: calll fmaximum_numl
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: vmovaps %xmm0, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: leal -4(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+start:
+ %0 = tail call fp128 @llvm.maximumnum.f128(fp128 %x, fp128 %y)
+ ret fp128 %0
+}
+
+define fp128 @test_fminimumnum_fp128(fp128 %x, fp128 %y) nounwind {
+; SSE2-LABEL: test_fminimumnum_fp128:
+; SSE2: # %bb.0: # %start
+; SSE2-NEXT: subq $40, %rsp
+; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: callq __unordtf2@PLT
+; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: jne .LBB40_2
+; SSE2-NEXT: # %bb.1: # %start
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: .LBB40_2: # %start
+; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: callq __unordtf2@PLT
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; SSE2-NEXT: jne .LBB40_4
+; SSE2-NEXT: # %bb.3: # %start
+; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: .LBB40_4: # %start
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT: callq __lttf2@PLT
+; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: js .LBB40_6
+; SSE2-NEXT: # %bb.5: # %start
+; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: .LBB40_6: # %start
+; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
+; SSE2-NEXT: callq __trunctfsf2@PLT
+; SSE2-NEXT: movaps (%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movd %xmm0, %eax
+; SSE2-NEXT: negl %eax
+; SSE2-NEXT: jo .LBB40_8
+; SSE2-NEXT: # %bb.7: # %start
+; SSE2-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE2-NEXT: .LBB40_8: # %start
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: callq __eqtf2@PLT
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; SSE2-NEXT: je .LBB40_10
+; SSE2-NEXT: # %bb.9: # %start
+; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: .LBB40_10: # %start
+; SSE2-NEXT: addq $40, %rsp
+; SSE2-NEXT: retq
+;
+; AVX-LABEL: test_fminimumnum_fp128:
+; AVX: # %bb.0: # %start
+; AVX-NEXT: subq $40, %rsp
+; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: callq __unordtf2@PLT
+; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: jne .LBB40_2
+; AVX-NEXT: # %bb.1: # %start
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: .LBB40_2: # %start
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, %xmm1
+; AVX-NEXT: callq __unordtf2@PLT
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: jne .LBB40_4
+; AVX-NEXT: # %bb.3: # %start
+; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: .LBB40_4: # %start
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: callq __lttf2@PLT
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: vmovdqa %xmm0, %xmm1
+; AVX-NEXT: js .LBB40_6
+; AVX-NEXT: # %bb.5: # %start
+; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: .LBB40_6: # %start
+; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: callq __trunctfsf2@PLT
+; AVX-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: negl %eax
+; AVX-NEXT: jo .LBB40_8
+; AVX-NEXT: # %bb.7: # %start
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: .LBB40_8: # %start
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vmovaps %xmm2, %xmm0
+; AVX-NEXT: callq __eqtf2@PLT
+; AVX-NEXT: testl %eax, %eax
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: je .LBB40_10
+; AVX-NEXT: # %bb.9: # %start
+; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: .LBB40_10: # %start
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: retq
+;
+; AVX10_2-LABEL: test_fminimumnum_fp128:
+; AVX10_2: # %bb.0: # %start
+; AVX10_2-NEXT: subq $40, %rsp
+; AVX10_2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT: vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT: callq __unordtf2@PLT
+; AVX10_2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT: testl %eax, %eax
+; AVX10_2-NEXT: vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT: jne .LBB40_2
+; AVX10_2-NEXT: # %bb.1: # %start
+; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT: .LBB40_2: # %start
+; AVX10_2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT: vmovaps %xmm0, %xmm1
+; AVX10_2-NEXT: callq __unordtf2@PLT
+; AVX10_2-NEXT: testl %eax, %eax
+; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT: jne .LBB40_4
+; AVX10_2-NEXT: # %bb.3: # %start
+; AVX10_2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT: .LBB40_4: # %start
+; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT: callq __lttf2@PLT
+; AVX10_2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT: testl %eax, %eax
+; AVX10_2-NEXT: vmovdqa %xmm0, %xmm1
+; AVX10_2-NEXT: js .LBB40_6
+; AVX10_2-NEXT: # %bb.5: # %start
+; AVX10_2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX10_2-NEXT: .LBB40_6: # %start
+; AVX10_2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX10_2-NEXT: callq __trunctfsf2@PLT
+; AVX10_2-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
+; AVX10_2-NEXT: vmovd %xmm0, %eax
+; AVX10_2-NEXT: negl %eax
+; AVX10_2-NEXT: jo .LBB40_8
+; AVX10_2-NEXT: # %bb.7: # %start
+; AVX10_2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX10_2-NEXT: .LBB40_8: # %start
+; AVX10_2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX10_2-NEXT: vmovaps %xmm2, %xmm0
+; AVX10_2-NEXT: callq __eqtf2@PLT
+; AVX10_2-NEXT: testl %eax, %eax
+; AVX10_2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT: je .LBB40_10
+; AVX10_2-NEXT: # %bb.9: # %start
+; AVX10_2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX10_2-NEXT: .LBB40_10: # %start
+; AVX10_2-NEXT: addq $40, %rsp
+; AVX10_2-NEXT: retq
+;
+; X86-LABEL: test_fminimumnum_fp128:
+; X86: # %bb.0: # %start
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: vmovups 24(%ebp), %ymm0
+; X86-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, (%esp)
+; X86-NEXT: vzeroupper
+; X86-NEXT: calll fminimum_numl
+; X86-NEXT: subl $4, %esp
+; X86-NEXT: vmovaps {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT: vmovaps %xmm0, (%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: leal -4(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl $4
+start:
+ %0 = tail call fp128 @llvm.minimumnum.f128(fp128 %x, fp128 %y)
+ ret fp128 %0
+}
diff --git a/llvm/test/CodeGen/X86/fminnum.ll b/llvm/test/CodeGen/X86/fminnum.ll
index 4aa1a61..5c882c9 100644
--- a/llvm/test/CodeGen/X86/fminnum.ll
+++ b/llvm/test/CodeGen/X86/fminnum.ll
@@ -676,15 +676,44 @@ define float @test_minnum_inf_nnan(float %x, float %y) nounwind {
; Test SNaN quieting
define float @test_minnum_snan(float %x) {
-; SSE-LABEL: test_minnum_snan:
-; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE-NEXT: retq
+; SSE2-LABEL: test_minnum_snan:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT: movaps %xmm0, %xmm1
+; SSE2-NEXT: cmpunordss %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm3
+; SSE2-NEXT: andps %xmm2, %xmm3
+; SSE2-NEXT: minss %xmm0, %xmm2
+; SSE2-NEXT: andnps %xmm2, %xmm1
+; SSE2-NEXT: orps %xmm3, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: test_minnum_snan:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; AVX-NEXT: retq
+; SSE4-LABEL: test_minnum_snan:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE4-NEXT: minss %xmm0, %xmm1
+; SSE4-NEXT: cmpunordss %xmm0, %xmm0
+; SSE4-NEXT: blendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE4-NEXT: movaps %xmm1, %xmm0
+; SSE4-NEXT: retq
+;
+; AVX1-LABEL: test_minnum_snan:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX1-NEXT: vminss %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test_minnum_snan:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; AVX512-NEXT: vminss %xmm0, %xmm2, %xmm1
+; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: retq
%r = call float @llvm.minnum.f32(float 0x7ff4000000000000, float %x)
ret float %r
}
diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
index 81529af..b655bda 100644
--- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -79,38 +79,54 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) {
; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
; CHECK-SSE-NEXT: retq
;
-; CHECK-AVX-LABEL: fmul_pow2_ldexp_4xfloat:
-; CHECK-AVX: # %bb.0:
-; CHECK-AVX-NEXT: subq $40, %rsp
-; CHECK-AVX-NEXT: .cfi_def_cfa_offset 48
-; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX-NEXT: vextractps $1, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vmovd %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vextractps $2, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
-; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-AVX-NEXT: vextractps $3, %xmm0, %edi
-; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX-NEXT: callq ldexpf@PLT
-; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
-; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; CHECK-AVX-NEXT: addq $40, %rsp
-; CHECK-AVX-NEXT: .cfi_def_cfa_offset 8
-; CHECK-AVX-NEXT: retq
+; CHECK-AVX2-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-AVX2: # %bb.0:
+; CHECK-AVX2-NEXT: subq $40, %rsp
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48
+; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-AVX2-NEXT: vextractps $1, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vmovd %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; CHECK-AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vextractps $2, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; CHECK-AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-AVX2-NEXT: vextractps $3, %xmm0, %edi
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-AVX2-NEXT: callq ldexpf@PLT
+; CHECK-AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; CHECK-AVX2-NEXT: addq $40, %rsp
+; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8
+; CHECK-AVX2-NEXT: retq
+;
+; CHECK-ONLY-AVX512F-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-ONLY-AVX512F: # %bb.0:
+; CHECK-ONLY-AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0]
+; CHECK-ONLY-AVX512F-NEXT: vmovaps %xmm0, %xmm0
+; CHECK-ONLY-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0
+; CHECK-ONLY-AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-ONLY-AVX512F-NEXT: vzeroupper
+; CHECK-ONLY-AVX512F-NEXT: retq
+;
+; CHECK-SKX-LABEL: fmul_pow2_ldexp_4xfloat:
+; CHECK-SKX: # %bb.0:
+; CHECK-SKX-NEXT: vcvtdq2ps %xmm0, %xmm0
+; CHECK-SKX-NEXT: vbroadcastss {{.*#+}} xmm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0]
+; CHECK-SKX-NEXT: vscalefps %xmm0, %xmm1, %xmm0
+; CHECK-SKX-NEXT: retq
%r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> <float 9.000000e+00, float 9.000000e+00, float 9.000000e+00, float 9.000000e+00>, <4 x i32> %i)
ret <4 x float> %r
}
@@ -562,79 +578,11 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
;
; CHECK-AVX512F-LABEL: fmul_pow2_ldexp_8xhalf:
; CHECK-AVX512F: # %bb.0:
-; CHECK-AVX512F-NEXT: subq $72, %rsp
-; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 80
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
-; CHECK-AVX512F-NEXT: movswl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
-; CHECK-AVX512F-NEXT: callq ldexpf@PLT
-; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; CHECK-AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-AVX512F-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-AVX512F-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; CHECK-AVX512F-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-AVX512F-NEXT: addq $72, %rsp
-; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 8
+; CHECK-AVX512F-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3,8.192E+3]
+; CHECK-AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0
+; CHECK-AVX512F-NEXT: vscalefps %zmm0, %zmm1, %zmm0
+; CHECK-AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; CHECK-AVX512F-NEXT: vzeroupper
; CHECK-AVX512F-NEXT: retq
%r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> <half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000, half 0xH7000>, <8 x i16> %i)
ret <8 x half> %r
@@ -1141,8 +1089,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
; CHECK-AVX2-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-AVX2: # %bb.0:
; CHECK-AVX2-NEXT: subq $56, %rsp
+; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
; CHECK-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-AVX2-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0]
; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax
@@ -1171,8 +1119,8 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
;
; CHECK-ONLY-AVX512F-LABEL: fmul_pow_shl_cnt_vec_fail_to_large:
; CHECK-ONLY-AVX512F: # %bb.0:
+; CHECK-ONLY-AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2]
; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-ONLY-AVX512F-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,0,0,0,0,0,0]
; CHECK-ONLY-AVX512F-NEXT: vpsllvd %ymm0, %ymm1, %ymm0
; CHECK-ONLY-AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; CHECK-ONLY-AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
diff --git a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
index c1beb7c..c9c88f7 100644
--- a/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
+++ b/llvm/test/CodeGen/X86/fold-loop-of-urem.ll
@@ -1031,31 +1031,30 @@ define void @simple_urem_fail_intermediate_inc(i32 %N, i32 %rem_amt) nounwind {
; CHECK-NEXT: testl %edi, %edi
; CHECK-NEXT: je .LBB17_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %r14d
; CHECK-NEXT: negl %r14d
-; CHECK-NEXT: movl $1, %r15d
+; CHECK-NEXT: movl $1, %ebp
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB17_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl %r15d, %eax
+; CHECK-NEXT: movl %ebp, %eax
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ebx
; CHECK-NEXT: movl %edx, %edi
; CHECK-NEXT: callq use.i32@PLT
-; CHECK-NEXT: leal 1(%r14,%r15), %eax
-; CHECK-NEXT: movl %r15d, %ecx
-; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: incl %ebp
+; CHECK-NEXT: leal 1(%r14,%rax), %eax
; CHECK-NEXT: cmpl $1, %eax
-; CHECK-NEXT: movl %ecx, %r15d
; CHECK-NEXT: jne .LBB17_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
-; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .LBB17_4: # %for.cond.cleanup
; CHECK-NEXT: retq
entry:
@@ -1199,32 +1198,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add(i32 %N, i32 %rem_amt_
; CHECK-NEXT: cmpl $3, %edi
; CHECK-NEXT: jb .LBB21_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %r14d
; CHECK-NEXT: orl $16, %ebx
; CHECK-NEXT: negl %r14d
-; CHECK-NEXT: movl $7, %r15d
+; CHECK-NEXT: movl $7, %ebp
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB21_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl %r15d, %eax
+; CHECK-NEXT: movl %ebp, %eax
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ebx
; CHECK-NEXT: movl %edx, %edi
; CHECK-NEXT: callq use.i32@PLT
-; CHECK-NEXT: leal 1(%r14,%r15), %eax
-; CHECK-NEXT: movl %r15d, %ecx
-; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: incl %ebp
+; CHECK-NEXT: leal 1(%r14,%rax), %eax
; CHECK-NEXT: cmpl $5, %eax
-; CHECK-NEXT: movl %ecx, %r15d
; CHECK-NEXT: jne .LBB21_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
-; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .LBB21_4: # %for.cond.cleanup
; CHECK-NEXT: retq
entry:
@@ -1251,32 +1249,31 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_missing_nuw(i32
; CHECK-NEXT: cmpl $3, %edi
; CHECK-NEXT: jb .LBB22_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %r14d
; CHECK-NEXT: orl $16, %ebx
; CHECK-NEXT: negl %r14d
-; CHECK-NEXT: movl $7, %r15d
+; CHECK-NEXT: movl $7, %ebp
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB22_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl %r15d, %eax
+; CHECK-NEXT: movl %ebp, %eax
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ebx
; CHECK-NEXT: movl %edx, %edi
; CHECK-NEXT: callq use.i32@PLT
-; CHECK-NEXT: leal 1(%r14,%r15), %eax
-; CHECK-NEXT: movl %r15d, %ecx
-; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: incl %ebp
+; CHECK-NEXT: leal 1(%r14,%rax), %eax
; CHECK-NEXT: cmpl $5, %eax
-; CHECK-NEXT: movl %ecx, %r15d
; CHECK-NEXT: jne .LBB22_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
-; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .LBB22_4: # %for.cond.cleanup
; CHECK-NEXT: retq
entry:
@@ -1303,31 +1300,30 @@ define void @simple_urem_to_sel_non_zero_start_through_add_fail_no_simplify_rem(
; CHECK-NEXT: cmpl $3, %edi
; CHECK-NEXT: jb .LBB23_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movl %esi, %ebx
; CHECK-NEXT: movl %edi, %r14d
; CHECK-NEXT: negl %r14d
-; CHECK-NEXT: movl $7, %r15d
+; CHECK-NEXT: movl $7, %ebp
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB23_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl %r15d, %eax
+; CHECK-NEXT: movl %ebp, %eax
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: divl %ebx
; CHECK-NEXT: movl %edx, %edi
; CHECK-NEXT: callq use.i32@PLT
-; CHECK-NEXT: leal 1(%r14,%r15), %eax
-; CHECK-NEXT: movl %r15d, %ecx
-; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: incl %ebp
+; CHECK-NEXT: leal 1(%r14,%rax), %eax
; CHECK-NEXT: cmpl $5, %eax
-; CHECK-NEXT: movl %ecx, %r15d
; CHECK-NEXT: jne .LBB23_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
-; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .LBB23_4: # %for.cond.cleanup
; CHECK-NEXT: retq
entry:
@@ -1404,32 +1400,31 @@ define void @simple_urem_to_sel_non_zero_start_through_sub_no_simplfy(i32 %N, i3
; CHECK-NEXT: cmpl %edx, %edi
; CHECK-NEXT: jbe .LBB25_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
-; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %rbx
-; CHECK-NEXT: movl %edx, %r15d
-; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %edx, %ebx
+; CHECK-NEXT: movl %esi, %ebp
; CHECK-NEXT: movl %edi, %r14d
; CHECK-NEXT: negl %r14d
-; CHECK-NEXT: addl $-2, %r15d
+; CHECK-NEXT: addl $-2, %ebx
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB25_2: # %for.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movl %r15d, %eax
+; CHECK-NEXT: movl %ebx, %eax
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %ebx
+; CHECK-NEXT: divl %ebp
; CHECK-NEXT: movl %edx, %edi
; CHECK-NEXT: callq use.i32@PLT
-; CHECK-NEXT: leal 1(%r14,%r15), %eax
-; CHECK-NEXT: movl %r15d, %ecx
-; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: movl %ebx, %eax
+; CHECK-NEXT: incl %ebx
+; CHECK-NEXT: leal 1(%r14,%rax), %eax
; CHECK-NEXT: cmpl $-2, %eax
-; CHECK-NEXT: movl %ecx, %r15d
; CHECK-NEXT: jne .LBB25_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r14
-; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .LBB25_4: # %for.cond.cleanup
; CHECK-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index e223765..46b2571 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -490,20 +490,21 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind {
define i32 @freeze_ashr_exact_extra_use(i32 %a0, ptr %escape) nounwind {
; X86-LABEL: freeze_ashr_exact_extra_use:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: sarl $3, %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sarl $3, %eax
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: sarl $6, %eax
+; X86-NEXT: movl %edx, (%ecx)
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr_exact_extra_use:
; X64: # %bb.0:
-; X64-NEXT: sarl $3, %edi
-; X64-NEXT: movl %edi, (%rsi)
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: sarl $3, %eax
+; X64-NEXT: movl %eax, %ecx
; X64-NEXT: sarl $6, %eax
+; X64-NEXT: movl %ecx, (%rsi)
; X64-NEXT: retq
%x = ashr exact i32 %a0, 3
%y = freeze i32 %x
@@ -604,20 +605,21 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind {
define i32 @freeze_lshr_exact_extra_use(i32 %a0, ptr %escape) nounwind {
; X86-LABEL: freeze_lshr_exact_extra_use:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: shrl $5, %eax
+; X86-NEXT: movl %edx, (%ecx)
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr_exact_extra_use:
; X64: # %bb.0:
-; X64-NEXT: shrl $3, %edi
-; X64-NEXT: movl %edi, (%rsi)
; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shrl $5, %eax
+; X64-NEXT: movl %ecx, (%rsi)
; X64-NEXT: retq
%x = lshr exact i32 %a0, 3
%y = freeze i32 %x
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index feac3dc..638d884 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -388,7 +388,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind {
; GFNISSE-NEXT: movdqa %xmm0, %xmm1
; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNISSE-NEXT: psllw $8, %xmm1
-; GFNISSE-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNISSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNISSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT: por %xmm1, %xmm0
; GFNISSE-NEXT: retq
@@ -397,7 +397,7 @@ define <16 x i8> @constant_shl_v16i8(<16 x i8> %a) nounwind {
; GFNIAVX1: # %bb.0:
; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; GFNIAVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; GFNIAVX1-NEXT: retq
@@ -1213,21 +1213,20 @@ define <32 x i8> @splatvar_ashr_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
; GFNISSE-LABEL: constant_shl_v32i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNISSE-NEXT: movdqa %xmm0, %xmm3
-; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3
+; GFNISSE-NEXT: pmullw %xmm2, %xmm3
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; GFNISSE-NEXT: pand %xmm4, %xmm3
; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm0
; GFNISSE-NEXT: psllw $8, %xmm0
; GFNISSE-NEXT: por %xmm3, %xmm0
-; GFNISSE-NEXT: movdqa %xmm1, %xmm3
-; GFNISSE-NEXT: pmaddubsw %xmm2, %xmm3
-; GFNISSE-NEXT: pand %xmm4, %xmm3
+; GFNISSE-NEXT: pmullw %xmm1, %xmm2
+; GFNISSE-NEXT: pand %xmm4, %xmm2
; GFNISSE-NEXT: pmaddubsw %xmm5, %xmm1
; GFNISSE-NEXT: psllw $8, %xmm1
-; GFNISSE-NEXT: por %xmm3, %xmm1
+; GFNISSE-NEXT: por %xmm2, %xmm1
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: constant_shl_v32i8:
@@ -1239,9 +1238,9 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1
; GFNIAVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
-; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3
-; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
+; GFNIAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; GFNIAVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1251,14 +1250,14 @@ define <32 x i8> @constant_shl_v32i8(<32 x i8> %a) nounwind {
; GFNIAVX2: # %bb.0:
; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; GFNIAVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; GFNIAVX2-NEXT: retq
;
; GFNIAVX512VL-LABEL: constant_shl_v32i8:
; GFNIAVX512VL: # %bb.0:
-; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX512VL-NEXT: vpsllw $8, %ymm0, %ymm0
; GFNIAVX512VL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst)
@@ -1684,15 +1683,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_shl_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -1876,15 +1874,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_lshr_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -2232,36 +2230,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_ashr_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; GFNIAVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -2542,9 +2520,9 @@ define <64 x i8> @splatvar_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNISSE-LABEL: constant_shl_v64i8:
; GFNISSE: # %bb.0:
-; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm4 = [1,4,16,64,128,32,8,2]
+; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNISSE-NEXT: movdqa %xmm0, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pmullw %xmm4, %xmm6
; GFNISSE-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; GFNISSE-NEXT: pand %xmm5, %xmm6
; GFNISSE-NEXT: movdqa {{.*#+}} xmm7 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
@@ -2552,23 +2530,22 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNISSE-NEXT: psllw $8, %xmm0
; GFNISSE-NEXT: por %xmm6, %xmm0
; GFNISSE-NEXT: movdqa %xmm1, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pmullw %xmm4, %xmm6
; GFNISSE-NEXT: pand %xmm5, %xmm6
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm1
; GFNISSE-NEXT: psllw $8, %xmm1
; GFNISSE-NEXT: por %xmm6, %xmm1
; GFNISSE-NEXT: movdqa %xmm2, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
+; GFNISSE-NEXT: pmullw %xmm4, %xmm6
; GFNISSE-NEXT: pand %xmm5, %xmm6
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm2
; GFNISSE-NEXT: psllw $8, %xmm2
; GFNISSE-NEXT: por %xmm6, %xmm2
-; GFNISSE-NEXT: movdqa %xmm3, %xmm6
-; GFNISSE-NEXT: pmaddubsw %xmm4, %xmm6
-; GFNISSE-NEXT: pand %xmm5, %xmm6
+; GFNISSE-NEXT: pmullw %xmm3, %xmm4
+; GFNISSE-NEXT: pand %xmm5, %xmm4
; GFNISSE-NEXT: pmaddubsw %xmm7, %xmm3
; GFNISSE-NEXT: psllw $8, %xmm3
-; GFNISSE-NEXT: por %xmm6, %xmm3
+; GFNISSE-NEXT: por %xmm4, %xmm3
; GFNISSE-NEXT: retq
;
; GFNIAVX1-LABEL: constant_shl_v64i8:
@@ -2580,9 +2557,9 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm4, %xmm5
; GFNIAVX1-NEXT: vpsllw $8, %xmm5, %xmm5
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; GFNIAVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = [1,4,16,64,128,32,8,2]
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm4, %xmm4
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
+; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm4, %xmm4
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; GFNIAVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; GFNIAVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
@@ -2593,8 +2570,8 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX1-NEXT: vpmaddubsw %xmm2, %xmm6, %xmm2
; GFNIAVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm6, %xmm3
-; GFNIAVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm6, %xmm3
+; GFNIAVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; GFNIAVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; GFNIAVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; GFNIAVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
@@ -2602,9 +2579,9 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
;
; GFNIAVX2-LABEL: constant_shl_v64i8:
; GFNIAVX2: # %bb.0:
-; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX2-NEXT: # ymm2 = mem[0,1,0,1]
-; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm3
+; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3
; GFNIAVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; GFNIAVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; GFNIAVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
@@ -2612,7 +2589,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpsllw $8, %ymm0, %ymm0
; GFNIAVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
-; GFNIAVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2
+; GFNIAVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm2
; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; GFNIAVX2-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm1
; GFNIAVX2-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -2622,10 +2599,10 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
; GFNIAVX512VL-LABEL: constant_shl_v64i8:
; GFNIAVX512VL: # %bb.0:
; GFNIAVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX512VL-NEXT: # ymm2 = mem[0,1,0,1]
-; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3
-; GFNIAVX512VL-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2
+; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; GFNIAVX512VL-NEXT: vpmullw %ymm2, %ymm0, %ymm2
; GFNIAVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; GFNIAVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX512VL-NEXT: # ymm3 = mem[0,1,0,1]
@@ -2639,7 +2616,7 @@ define <64 x i8> @constant_shl_v64i8(<64 x i8> %a) nounwind {
;
; GFNIAVX512BW-LABEL: constant_shl_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; GFNIAVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; GFNIAVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; GFNIAVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/haddsubsat.ll b/llvm/test/CodeGen/X86/haddsubsat.ll
new file mode 100644
index 0000000..588f338
--- /dev/null
+++ b/llvm/test/CodeGen/X86/haddsubsat.ll
@@ -0,0 +1,101 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s -check-prefix=SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=AVX2
+
+define <8 x i16> @phaddsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v8i16_intrinsic:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddsw %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX2-LABEL: phaddsw_v8i16_intrinsic:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @phaddsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v8i16_generic:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddsw %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX2-LABEL: phaddsw_v8i16_generic:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %sum = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
+ ret <8 x i16> %sum
+}
+
+define <16 x i16> @phaddsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
+; SSSE3-LABEL: phaddsw_v16i16_generic:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phaddsw %xmm1, %xmm0
+; SSSE3-NEXT: phaddsw %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX2-LABEL: phaddsw_v16i16_generic:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphaddsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+ %even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %sum = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
+ ret <16 x i16> %sum
+}
+
+define <8 x i16> @phsubsw_v8i16_intrinsic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v8i16_intrinsic:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubsw %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX2-LABEL: phsubsw_v8i16_intrinsic:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a, <8 x i16> %b)
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @phsubsw_v8i16_generic(<8 x i16> %a, <8 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v8i16_generic:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubsw %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; AVX2-LABEL: phsubsw_v8i16_generic:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+ %even = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %odd = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %diff = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %even, <8 x i16> %odd)
+ ret <8 x i16> %diff
+}
+
+define <16 x i16> @phsubsw_v16i16_generic(<16 x i16> %a, <16 x i16> %b) {
+; SSSE3-LABEL: phsubsw_v16i16_generic:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: phsubsw %xmm1, %xmm0
+; SSSE3-NEXT: phsubsw %xmm3, %xmm2
+; SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-NEXT: retq
+;
+; AVX2-LABEL: phsubsw_v16i16_generic:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vphsubsw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: retq
+ %even = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %odd = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %diff = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %even, <16 x i16> %odd)
+ ret <16 x i16> %diff
+}
diff --git a/llvm/test/CodeGen/X86/icmp-abs-C.ll b/llvm/test/CodeGen/X86/icmp-abs-C.ll
index 53b70fa..c98889b 100644
--- a/llvm/test/CodeGen/X86/icmp-abs-C.ll
+++ b/llvm/test/CodeGen/X86/icmp-abs-C.ll
@@ -161,22 +161,22 @@ define i16 @ne_and_with_dom_abs(i16 %x) nounwind {
; X86-LABEL: ne_and_with_dom_abs:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movswl %cx, %eax
-; X86-NEXT: sarl $15, %eax
-; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: subl %eax, %ecx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movswl %ax, %ecx
+; X86-NEXT: sarl $15, %ecx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: subl %ecx, %eax
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: xorl $12312, %eax # imm = 0x3018
; X86-NEXT: movzwl %ax, %esi
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: cmpw $64, %cx
-; X86-NEXT: setne %cl
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: cmpw $64, %dx
+; X86-NEXT: setne %dl
; X86-NEXT: cmpl $2345, %esi # imm = 0x929
; X86-NEXT: jae .LBB3_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: movb %cl, %dl
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movb %dl, %cl
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: .LBB3_2:
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo b/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo
deleted file mode 100644
index 935b707..0000000
--- a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo
+++ /dev/null
@@ -1,4 +0,0 @@
-caller:0:0
- 2: sum:0
- 3: 0 __prefetch_nta_0:23456
- 3.1: 0 __prefetch_nta_0:8764 __prefetch_nta_1:64 \ No newline at end of file
diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.ll b/llvm/test/CodeGen/X86/insert-prefetch-inline.ll
deleted file mode 100644
index 05f5427..0000000
--- a/llvm/test/CodeGen/X86/insert-prefetch-inline.ll
+++ /dev/null
@@ -1,76 +0,0 @@
-; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-inline.afdo | FileCheck %s
-;
-; Verify we can insert prefetch instructions in code belonging to inlined
-; functions.
-;
-; ModuleID = 'test.cc'
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: norecurse nounwind readonly uwtable
-define dso_local i32 @sum(ptr nocapture readonly %arr, i32 %pos1, i32 %pos2) local_unnamed_addr #0 !dbg !7 {
-entry:
- %idxprom = sext i32 %pos1 to i64, !dbg !10
- %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !10
- %0 = load i32, ptr %arrayidx, align 4, !dbg !10, !tbaa !11
- %idxprom1 = sext i32 %pos2 to i64, !dbg !15
- %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !15
- %1 = load i32, ptr %arrayidx2, align 4, !dbg !15, !tbaa !11
- %add = add nsw i32 %1, %0, !dbg !16
- ret i32 %add, !dbg !17
-}
-
-; "caller" inlines "sum". The associated .afdo file references instructions
-; in "caller" that came from "sum"'s inlining.
-;
-; Function Attrs: norecurse nounwind readonly uwtable
-define dso_local i32 @caller(ptr nocapture readonly %arr) local_unnamed_addr #0 !dbg !18 {
-entry:
- %0 = load i32, ptr %arr, align 4, !dbg !19, !tbaa !11
- %arrayidx2.i = getelementptr inbounds i32, ptr %arr, i64 2, !dbg !21
- %1 = load i32, ptr %arrayidx2.i, align 4, !dbg !21, !tbaa !11
- %add.i = add nsw i32 %1, %0, !dbg !22
- ret i32 %add.i, !dbg !23
-}
-
-attributes #0 = { "target-cpu"="x86-64" }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true)
-!1 = !DIFile(filename: "test.cc", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{!"clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)"}
-!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !8, file: !8, line: 3, type: !9, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
-!8 = !DIFile(filename: "./test.h", directory: "/tmp")
-!9 = !DISubroutineType(types: !2)
-!10 = !DILocation(line: 6, column: 10, scope: !7)
-!11 = !{!12, !12, i64 0}
-!12 = !{!"int", !13, i64 0}
-!13 = !{!"omnipotent char", !14, i64 0}
-!14 = !{!"Simple C++ TBAA"}
-!15 = !DILocation(line: 6, column: 22, scope: !7)
-!16 = !DILocation(line: 6, column: 20, scope: !7)
-!17 = !DILocation(line: 6, column: 3, scope: !7)
-!18 = distinct !DISubprogram(name: "caller", linkageName: "caller", scope: !1, file: !1, line: 4, type: !9, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
-!19 = !DILocation(line: 6, column: 10, scope: !7, inlinedAt: !20)
-!20 = distinct !DILocation(line: 6, column: 10, scope: !18)
-!21 = !DILocation(line: 6, column: 22, scope: !7, inlinedAt: !20)
-!22 = !DILocation(line: 6, column: 20, scope: !7, inlinedAt: !20)
-!23 = !DILocation(line: 6, column: 3, scope: !18)
-
-; CHECK-LABEL: caller:
-; CHECK-LABEL: # %bb.0:
-; CHECK-NEXT: .loc 1 6 22 prologue_end
-; CHECK-NEXT: prefetchnta 23464(%rdi)
-; CHECK-NEXT: movl 8(%rdi), %eax
-; CHECK-NEXT: .loc 1 6 20 is_stmt 0 discriminator 2
-; CHECK-NEXT: prefetchnta 8764(%rdi)
-; CHECK-NEXT: prefetchnta 64(%rdi)
-; CHECK-NEXT: addl (%rdi), %eax
diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo
deleted file mode 100644
index 6385a49..0000000
--- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo
+++ /dev/null
@@ -1,2 +0,0 @@
-main:0:0
- 6: 0 __prefetch_nta_0:42 \ No newline at end of file
diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll
deleted file mode 100644
index f8e2502..0000000
--- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-invalid-instr.afdo | FileCheck %s
-; ModuleID = 'prefetch.cc'
-source_filename = "prefetch.cc"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: norecurse nounwind uwtable
-define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 {
-entry:
- tail call void @llvm.prefetch(ptr inttoptr (i64 291 to ptr), i32 0, i32 0, i32 1), !dbg !9
- ret i32 291, !dbg !11
-}
-
-; Function Attrs: inaccessiblemem_or_argmemonly nounwind
-declare void @llvm.prefetch(ptr nocapture readonly, i32, i32, i32) #1
-
-attributes #0 = {"target-cpu"="x86-64" "target-features"="+sse4.2,+ssse3"}
-attributes #1 = { inaccessiblemem_or_argmemonly nounwind }
-attributes #2 = { argmemonly nounwind }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true)
-!1 = !DIFile(filename: "prefetch.cc", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{!"clang version 7.0.0 (trunk 327078) (llvm/trunk 327086)"}
-!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
-!8 = !DISubroutineType(types: !2)
-!9 = !DILocation(line: 12, column: 3, scope: !7)
-!10 = !DILocation(line: 14, column: 3, scope: !7)
-!11 = !DILocation(line: 15, column: 3, scope: !7)
-
-;CHECK-LABEL: main:
-;CHECK: # %bb.0:
-;CHECK: prefetchnta 291
-;CHECK-NOT: prefetchnta 42(%rax,%ymm0)
diff --git a/llvm/test/CodeGen/X86/insert-prefetch-other.afdo b/llvm/test/CodeGen/X86/insert-prefetch-other.afdo
deleted file mode 100644
index 783da34..0000000
--- a/llvm/test/CodeGen/X86/insert-prefetch-other.afdo
+++ /dev/null
@@ -1,3 +0,0 @@
-sum:0:0
- 1: 0 __prefetch_t0_1:0 __prefetch_t2_0:42
- 1.1: 0 __prefetch_t1_0:18446744073709551615
diff --git a/llvm/test/CodeGen/X86/insert-prefetch.afdo b/llvm/test/CodeGen/X86/insert-prefetch.afdo
deleted file mode 100644
index 96487e85..0000000
--- a/llvm/test/CodeGen/X86/insert-prefetch.afdo
+++ /dev/null
@@ -1,3 +0,0 @@
-sum:0:0
- 1: 0 __prefetch_nta_1:0 __prefetch_nta_0:42
- 1.1: 0 __prefetch_nta_0:18446744073709551615
diff --git a/llvm/test/CodeGen/X86/insert-prefetch.ll b/llvm/test/CodeGen/X86/insert-prefetch.ll
deleted file mode 100644
index 971a619..0000000
--- a/llvm/test/CodeGen/X86/insert-prefetch.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch.afdo | FileCheck %s
-; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-other.afdo | FileCheck %s -check-prefix=OTHERS
-;
-; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling:
-; int sum(int* arr, int pos1, int pos2) {
-; return arr[pos1] + arr[pos2];
-; }
-;
-; NOTE: debug line numbers were adjusted such that the function would start
-; at line 15 (an arbitrary number). The sample profile file format uses
-; offsets from the start of the symbol instead of file-relative line numbers.
-; The .afdo file reflects that - the instructions are offset '1'.
-;
-; ModuleID = 'test.cc'
-source_filename = "test.cc"
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !35 !prof !37 {
-entry:
- %idxprom = sext i32 %pos1 to i64, !dbg !38
- %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !38
- %0 = load i32, ptr %arrayidx, align 4, !dbg !38, !tbaa !39
- %idxprom1 = sext i32 %pos2 to i64, !dbg !43
- %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !43
- %1 = load i32, ptr %arrayidx2, align 4, !dbg !43, !tbaa !39
- %add = add nsw i32 %1, %0, !dbg !44
- ret i32 %add, !dbg !45
-}
-
-attributes #0 = { "target-cpu"="x86-64" }
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5, !6}
-!llvm.ident = !{!33}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true)
-!1 = !DIFile(filename: "test.cc", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"wchar_size", i32 4}
-!6 = !{i32 1, !"ProfileSummary", !7}
-!7 = !{!8, !9, !10, !11, !12, !13, !14, !15}
-!8 = !{!"ProfileFormat", !"SampleProfile"}
-!9 = !{!"TotalCount", i64 0}
-!10 = !{!"MaxCount", i64 0}
-!11 = !{!"MaxInternalCount", i64 0}
-!12 = !{!"MaxFunctionCount", i64 0}
-!13 = !{!"NumCounts", i64 2}
-!14 = !{!"NumFunctions", i64 1}
-!15 = !{!"DetailedSummary", !16}
-!16 = !{!17, !18, !19, !20, !21, !22, !22, !23, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32}
-!17 = !{i32 10000, i64 0, i32 0}
-!18 = !{i32 100000, i64 0, i32 0}
-!19 = !{i32 200000, i64 0, i32 0}
-!20 = !{i32 300000, i64 0, i32 0}
-!21 = !{i32 400000, i64 0, i32 0}
-!22 = !{i32 500000, i64 0, i32 0}
-!23 = !{i32 600000, i64 0, i32 0}
-!24 = !{i32 700000, i64 0, i32 0}
-!25 = !{i32 800000, i64 0, i32 0}
-!26 = !{i32 900000, i64 0, i32 0}
-!27 = !{i32 950000, i64 0, i32 0}
-!28 = !{i32 990000, i64 0, i32 0}
-!29 = !{i32 999000, i64 0, i32 0}
-!30 = !{i32 999900, i64 0, i32 0}
-!31 = !{i32 999990, i64 0, i32 0}
-!32 = !{i32 999999, i64 0, i32 0}
-!33 = !{!"clang version 7.0.0 (trunk 322593) (llvm/trunk 322526)"}
-!35 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 15, type: !36, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
-!36 = !DISubroutineType(types: !2)
-!37 = !{!"function_entry_count", i64 -1}
-!38 = !DILocation(line: 16, column: 10, scope: !35)
-!39 = !{!40, !40, i64 0}
-!40 = !{!"int", !41, i64 0}
-!41 = !{!"omnipotent char", !42, i64 0}
-!42 = !{!"Simple C++ TBAA"}
-!43 = !DILocation(line: 16, column: 22, scope: !35)
-!44 = !DILocation(line: 16, column: 20, scope: !35)
-!45 = !DILocation(line: 16, column: 3, scope: !35)
-
-;CHECK-LABEL: sum:
-;CHECK: # %bb.0:
-;CHECK: prefetchnta 42(%rdi,%rax,4)
-;CHECK-NEXT: prefetchnta (%rdi,%rax,4)
-;CHECK-NEXT: movl (%rdi,%rax,4), %eax
-;CHECK-NEXT: .loc 1 16 20 discriminator 2 # test.cc:16:20
-;CHECK-NEXT: prefetchnta -1(%rdi,%rcx,4)
-;CHECK-NEXT: addl (%rdi,%rcx,4), %eax
-;CHECK-NEXT: .loc 1 16 3 # test.cc:16:3
-
-;OTHERS-LABEL: sum:
-;OTHERS: # %bb.0:
-;OTHERS: prefetcht2 42(%rdi,%rax,4)
-;OTHERS-NEXT: prefetcht0 (%rdi,%rax,4)
-;OTHERS-NEXT: movl (%rdi,%rax,4), %eax
-;OTHERS-NEXT: .loc 1 16 20 discriminator 2 # test.cc:16:20
-;OTHERS-NEXT: prefetcht1 -1(%rdi,%rcx,4)
-;OTHERS-NEXT: addl (%rdi,%rcx,4), %eax
-;OTHERS-NEXT: .loc 1 16 3 # test.cc:16:3
diff --git a/llvm/test/CodeGen/X86/ipra-reg-usage.ll b/llvm/test/CodeGen/X86/ipra-reg-usage.ll
index e73ff79..f270f8f 100644
--- a/llvm/test/CodeGen/X86/ipra-reg-usage.ll
+++ b/llvm/test/CodeGen/X86/ipra-reg-usage.ll
@@ -7,7 +7,7 @@
target triple = "x86_64-unknown-unknown"
declare void @bar1()
define preserve_allcc void @foo()#0 {
-; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
+; CHECK: foo Clobbered Registers: $cs $df $ds $eflags $eip $eiz $es $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hip $hsp $ip $mxcsr $rflags $rip $riz $rsp $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $r11b $r11bh $r11d $r11w $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
call void @bar1()
call void @bar2()
ret void
@@ -15,7 +15,7 @@ define preserve_allcc void @foo()#0 {
declare void @bar2()
define preserve_nonecc void @foo2()#0 {
-; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $tmm0_tmm1 $tmm2_tmm3 $tmm4_tmm5 $tmm6_tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
+; CHECK: foo2 Clobbered Registers: $ah $al $ax $ch $cl $cs $cx $df $dh $di $dih $dil $dl $ds $dx $eax $ecx $edi $edx $eflags $eip $eiz $es $esi $esp $fpcw $fpsw $fs $fs_base $gs $gs_base $hax $hcx $hdi $hdx $hip $hsi $hsp $ip $mxcsr $rax $rcx $rdi $rdx $rflags $rip $riz $rsi $rsp $si $sih $sil $sp $sph $spl $ss $ssp $_eflags $cr0 $cr1 $cr2 $cr3 $cr4 $cr5 $cr6 $cr7 $cr8 $cr9 $cr10 $cr11 $cr12 $cr13 $cr14 $cr15 $dr0 $dr1 $dr2 $dr3 $dr4 $dr5 $dr6 $dr7 $dr8 $dr9 $dr10 $dr11 $dr12 $dr13 $dr14 $dr15 $fp0 $fp1 $fp2 $fp3 $fp4 $fp5 $fp6 $fp7 $mm0 $mm1 $mm2 $mm3 $mm4 $mm5 $mm6 $mm7 $r8 $r9 $r10 $r11 $st0 $st1 $st2 $st3 $st4 $st5 $st6 $st7 $xmm0 $xmm1 $xmm2 $xmm3 $xmm4 $xmm5 $xmm6 $xmm7 $xmm8 $xmm9 $xmm10 $xmm11 $xmm12 $xmm13 $xmm14 $xmm15 $r8b $r9b $r10b $r11b $r8bh $r9bh $r10bh $r11bh $r8d $r9d $r10d $r11d $r8w $r9w $r10w $r11w $r8wh $r9wh $r10wh $r11wh $ymm0 $ymm1 $ymm2 $ymm3 $ymm4 $ymm5 $ymm6 $ymm7 $ymm8 $ymm9 $ymm10 $ymm11 $ymm12 $ymm13 $ymm14 $ymm15 $k0 $k1 $k2 $k3 $k4 $k5 $k6 $k7 $xmm16 $xmm17 $xmm18 $xmm19 $xmm20 $xmm21 $xmm22 $xmm23 $xmm24 $xmm25 $xmm26 $xmm27 $xmm28 $xmm29 $xmm30 $xmm31 $ymm16 $ymm17 $ymm18 $ymm19 $ymm20 $ymm21 $ymm22 $ymm23 $ymm24 $ymm25 $ymm26 $ymm27 $ymm28 $ymm29 $ymm30 $ymm31 $zmm0 $zmm1 $zmm2 $zmm3 $zmm4 $zmm5 $zmm6 $zmm7 $zmm8 $zmm9 $zmm10 $zmm11 $zmm12 $zmm13 $zmm14 $zmm15 $zmm16 $zmm17 $zmm18 $zmm19 $zmm20 $zmm21 $zmm22 $zmm23 $zmm24 $zmm25 $zmm26 $zmm27 $zmm28 $zmm29 $zmm30 $zmm31 $k0_k1 $k2_k3 $k4_k5 $k6_k7 $tmmcfg $tmm0 $tmm1 $tmm2 $tmm3 $tmm4 $tmm5 $tmm6 $tmm7 $r16 $r17 $r18 $r19 $r20 $r21 $r22 $r23 $r24 $r25 $r26 $r27 $r28 $r29 $r30 $r31 $r16b $r17b $r18b $r19b $r20b $r21b $r22b $r23b $r24b $r25b $r26b $r27b $r28b $r29b $r30b $r31b $r16bh $r17bh $r18bh $r19bh $r20bh $r21bh $r22bh $r23bh $r24bh $r25bh $r26bh $r27bh $r28bh $r29bh $r30bh $r31bh $r16d $r17d $r18d $r19d $r20d $r21d $r22d $r23d $r24d $r25d $r26d $r27d $r28d $r29d $r30d $r31d $r16w $r17w $r18w $r19w $r20w $r21w $r22w $r23w $r24w $r25w $r26w $r27w $r28w $r29w $r30w $r31w $r16wh $r17wh $r18wh $r19wh $r20wh $r21wh $r22wh $r23wh $r24wh $r25wh $r26wh $r27wh $r28wh $r29wh $r30wh $r31wh
call void @bar1()
call void @bar2()
ret void
diff --git a/llvm/test/CodeGen/X86/isel-arg-attrs.ll b/llvm/test/CodeGen/X86/isel-arg-attrs.ll
new file mode 100644
index 0000000..3afee76
--- /dev/null
+++ b/llvm/test/CodeGen/X86/isel-arg-attrs.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=i686-linux-gnu | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=X64
+
+; The src pointer should be passed in the R10 (x86-64) or ECX (i686) register due to the nest attribute.
+define i32 @nest_arg(ptr nest %src) {
+; X86-LABEL: nest_arg:
+; X86: # %bb.0:
+; X86-NEXT: movl 8(%ecx), %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: nest_arg:
+; X64: # %bb.0:
+; X64-NEXT: movl 8(%r10), %eax
+; X64-NEXT: retq
+ %off = getelementptr [3 x i32], ptr %src, i32 0, i32 2
+ %ret = load i32, ptr %off
+ ret i32 %ret
+}
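
The nest attribute passes its pointer outside the normal argument sequence, which is why the checks read through %ecx on i686 and %r10 on x86-64. A minimal hand-written caller (a hypothetical illustration, not part of the test) that would exercise the same convention:

define i32 @call_nest_arg(ptr %env) {
  ; At this call site the nest argument is materialized in %r10 (x86-64) or
  ; %ecx (i686) rather than in the regular integer-argument registers.
  %v = call i32 @nest_arg(ptr nest %env)
  ret i32 %v
}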
diff --git a/llvm/test/CodeGen/X86/isel-icmp.ll b/llvm/test/CodeGen/X86/isel-icmp.ll
index 8a4d035..065d701 100644
--- a/llvm/test/CodeGen/X86/isel-icmp.ll
+++ b/llvm/test/CodeGen/X86/isel-icmp.ll
@@ -1,11 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefix=SDAG-X64
-; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefix=FAST-X64
-; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=GISEL-X64
-; RUN: llc < %s -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=SDAG-X86
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=SDAG-X64
+; Allow fast-isel to fall back to selection dag on x86 for the i96 type.
+; RUN: llc < %s -fast-isel -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=FAST-X64
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefixes=GISEL-X64
+; RUN: llc < %s -mtriple=i686-apple-darwin10 | FileCheck %s --check-prefixes=SDAG-X86
; Allow fast-isel to fall back to selection dag on x86
-; RUN: llc < %s -fast-isel -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=FAST-X86
-; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=i686-apple-darwin10 -verify-machineinstrs | FileCheck %s --check-prefixes=GISEL-X86
+; RUN: llc < %s -fast-isel -mtriple=i686-apple-darwin10 | FileCheck %s --check-prefixes=FAST-X86
+; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=i686-apple-darwin10 | FileCheck %s --check-prefixes=GISEL-X86
define i32 @test_icmp_eq_i8(i8 %a, i8 %b) {
; SDAG-X64-LABEL: test_icmp_eq_i8:
@@ -720,3 +721,168 @@ define i32 @test_icmp_sle_i32(i32 %a, i32 %b) {
%res = zext i1 %r to i32
ret i32 %res
}
+
+; PR167326
+define i32 @test_icmp_sge_i96(i96 %a, i96 %b) nounwind {
+; SDAG-X64-LABEL: test_icmp_sge_i96:
+; SDAG-X64: ## %bb.0:
+; SDAG-X64-NEXT: movslq %ecx, %rax
+; SDAG-X64-NEXT: movslq %esi, %rcx
+; SDAG-X64-NEXT: cmpq %rdx, %rdi
+; SDAG-X64-NEXT: sbbq %rax, %rcx
+; SDAG-X64-NEXT: setge %al
+; SDAG-X64-NEXT: movzbl %al, %eax
+; SDAG-X64-NEXT: retq
+;
+; FAST-X64-LABEL: test_icmp_sge_i96:
+; FAST-X64: ## %bb.0:
+; FAST-X64-NEXT: movslq %ecx, %rax
+; FAST-X64-NEXT: movslq %esi, %rcx
+; FAST-X64-NEXT: cmpq %rdx, %rdi
+; FAST-X64-NEXT: sbbq %rax, %rcx
+; FAST-X64-NEXT: setge %al
+; FAST-X64-NEXT: andb $1, %al
+; FAST-X64-NEXT: movzbl %al, %eax
+; FAST-X64-NEXT: retq
+;
+; GISEL-X64-LABEL: test_icmp_sge_i96:
+; GISEL-X64: ## %bb.0:
+; GISEL-X64-NEXT: movq %rcx, %rax
+; GISEL-X64-NEXT: movq %rdi, %r8
+; GISEL-X64-NEXT: movb $32, %cl
+; GISEL-X64-NEXT: shlq %cl, %r8
+; GISEL-X64-NEXT: shlq %cl, %rsi
+; GISEL-X64-NEXT: shrq %cl, %rdi
+; GISEL-X64-NEXT: orq %rsi, %rdi
+; GISEL-X64-NEXT: shrq %cl, %r8
+; GISEL-X64-NEXT: movq %rdi, %rsi
+; GISEL-X64-NEXT: shlq %cl, %rsi
+; GISEL-X64-NEXT: orq %r8, %rsi
+; GISEL-X64-NEXT: sarq %cl, %rdi
+; GISEL-X64-NEXT: movq %rdx, %rcx
+; GISEL-X64-NEXT: shlq $32, %rcx
+; GISEL-X64-NEXT: shlq $32, %rax
+; GISEL-X64-NEXT: shrq $32, %rdx
+; GISEL-X64-NEXT: orq %rax, %rdx
+; GISEL-X64-NEXT: shrq $32, %rcx
+; GISEL-X64-NEXT: movq %rdx, %rax
+; GISEL-X64-NEXT: shlq $32, %rax
+; GISEL-X64-NEXT: orq %rcx, %rax
+; GISEL-X64-NEXT: sarq $32, %rdx
+; GISEL-X64-NEXT: xorl %ecx, %ecx
+; GISEL-X64-NEXT: cmpq %rax, %rsi
+; GISEL-X64-NEXT: setae %cl
+; GISEL-X64-NEXT: xorl %eax, %eax
+; GISEL-X64-NEXT: xorl %esi, %esi
+; GISEL-X64-NEXT: cmpq %rdx, %rdi
+; GISEL-X64-NEXT: setge %al
+; GISEL-X64-NEXT: sete %sil
+; GISEL-X64-NEXT: testl %esi, %esi
+; GISEL-X64-NEXT: cmovnew %cx, %ax
+; GISEL-X64-NEXT: andl $1, %eax
+; GISEL-X64-NEXT: retq
+;
+; SDAG-X86-LABEL: test_icmp_sge_i96:
+; SDAG-X86: ## %bb.0:
+; SDAG-X86-NEXT: pushl %ebx
+; SDAG-X86-NEXT: pushl %edi
+; SDAG-X86-NEXT: pushl %esi
+; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SDAG-X86-NEXT: movl %eax, %ecx
+; SDAG-X86-NEXT: sarl $31, %ecx
+; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; SDAG-X86-NEXT: movl %edx, %esi
+; SDAG-X86-NEXT: sarl $31, %esi
+; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SDAG-X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; SDAG-X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
+; SDAG-X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx
+; SDAG-X86-NEXT: sbbl %eax, %edx
+; SDAG-X86-NEXT: sbbl %ecx, %esi
+; SDAG-X86-NEXT: setge %al
+; SDAG-X86-NEXT: movzbl %al, %eax
+; SDAG-X86-NEXT: popl %esi
+; SDAG-X86-NEXT: popl %edi
+; SDAG-X86-NEXT: popl %ebx
+; SDAG-X86-NEXT: retl
+;
+; FAST-X86-LABEL: test_icmp_sge_i96:
+; FAST-X86: ## %bb.0:
+; FAST-X86-NEXT: pushl %ebx
+; FAST-X86-NEXT: pushl %edi
+; FAST-X86-NEXT: pushl %esi
+; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; FAST-X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; FAST-X86-NEXT: movl %eax, %edi
+; FAST-X86-NEXT: sarl $31, %edi
+; FAST-X86-NEXT: movl %edx, %ebx
+; FAST-X86-NEXT: sarl $31, %ebx
+; FAST-X86-NEXT: cmpl %ecx, {{[0-9]+}}(%esp)
+; FAST-X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi
+; FAST-X86-NEXT: sbbl %eax, %edx
+; FAST-X86-NEXT: sbbl %edi, %ebx
+; FAST-X86-NEXT: setge %al
+; FAST-X86-NEXT: andb $1, %al
+; FAST-X86-NEXT: movzbl %al, %eax
+; FAST-X86-NEXT: popl %esi
+; FAST-X86-NEXT: popl %edi
+; FAST-X86-NEXT: popl %ebx
+; FAST-X86-NEXT: retl
+;
+; GISEL-X86-LABEL: test_icmp_sge_i96:
+; GISEL-X86: ## %bb.0:
+; GISEL-X86-NEXT: pushl %ebp
+; GISEL-X86-NEXT: pushl %ebx
+; GISEL-X86-NEXT: pushl %edi
+; GISEL-X86-NEXT: pushl %esi
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; GISEL-X86-NEXT: movl %edx, %eax
+; GISEL-X86-NEXT: movb $31, %cl
+; GISEL-X86-NEXT: sarl %cl, %eax
+; GISEL-X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: setae %ch
+; GISEL-X86-NEXT: xorl %ebx, %ebx
+; GISEL-X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp)
+; GISEL-X86-NEXT: setae %cl
+; GISEL-X86-NEXT: sete %bl
+; GISEL-X86-NEXT: testl %ebx, %ebx
+; GISEL-X86-NEXT: je LBB13_2
+; GISEL-X86-NEXT: ## %bb.1:
+; GISEL-X86-NEXT: movb %ch, %cl
+; GISEL-X86-NEXT: LBB13_2:
+; GISEL-X86-NEXT: movl %esi, %edi
+; GISEL-X86-NEXT: sarl $31, %edi
+; GISEL-X86-NEXT: xorl %ebx, %ebx
+; GISEL-X86-NEXT: cmpl %esi, %edx
+; GISEL-X86-NEXT: setae %dl
+; GISEL-X86-NEXT: sete %bl
+; GISEL-X86-NEXT: testl %ebx, %ebx
+; GISEL-X86-NEXT: je LBB13_4
+; GISEL-X86-NEXT: ## %bb.3:
+; GISEL-X86-NEXT: movl %ecx, %edx
+; GISEL-X86-NEXT: LBB13_4:
+; GISEL-X86-NEXT: xorl %ecx, %ecx
+; GISEL-X86-NEXT: cmpl %edi, %eax
+; GISEL-X86-NEXT: setge %al
+; GISEL-X86-NEXT: sete %cl
+; GISEL-X86-NEXT: testl %ecx, %ecx
+; GISEL-X86-NEXT: je LBB13_6
+; GISEL-X86-NEXT: ## %bb.5:
+; GISEL-X86-NEXT: movl %edx, %eax
+; GISEL-X86-NEXT: LBB13_6:
+; GISEL-X86-NEXT: movzbl %al, %eax
+; GISEL-X86-NEXT: andl $1, %eax
+; GISEL-X86-NEXT: popl %esi
+; GISEL-X86-NEXT: popl %edi
+; GISEL-X86-NEXT: popl %ebx
+; GISEL-X86-NEXT: popl %ebp
+; GISEL-X86-NEXT: retl
+ %r = icmp sge i96 %a, %b
+ %res = zext i1 %r to i32
+ ret i32 %res
+}
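
As background for the new i96 test, here is a hand-written IR sketch (an illustration of the equivalence, not the expansion the backend emits) showing how a signed i96 greater-or-equal reduces to a signed compare of the high 32 bits plus an unsigned compare of the low 64 bits, which is what the compare/subtract-with-borrow sequences above compute:

define i1 @sge_i96_sketch(i96 %a, i96 %b) {
  %alo  = trunc i96 %a to i64
  %blo  = trunc i96 %b to i64
  %atop = lshr i96 %a, 64
  %btop = lshr i96 %b, 64
  %ahi  = trunc i96 %atop to i32
  %bhi  = trunc i96 %btop to i32
  ; a < b (signed) iff the high words compare signed-less, or they are equal
  ; and the low words compare unsigned-less; a >= b is the negation of that.
  %hi.lt = icmp slt i32 %ahi, %bhi
  %hi.eq = icmp eq i32 %ahi, %bhi
  %lo.lt = icmp ult i64 %alo, %blo
  %eq.lo = and i1 %hi.eq, %lo.lt
  %lt    = or i1 %hi.lt, %eq.lo
  %sge   = xor i1 %lt, true
  ret i1 %sge
}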
diff --git a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
index 065710f..8576f8f 100644
--- a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
@@ -3,6 +3,9 @@
; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-SINCOS-STRET
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-NOSINCOS-STRET
+
; TODO: The RUN line below fails GISEL selection and falls back to DAG selection due to the lack of support for loads/stores in i686 mode. Support is expected soon; until then, the llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir test covers i686, which GlobalISel does not yet handle here.
; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
@@ -34,6 +37,29 @@ define { float, float } @test_sincos_f32(float %Val) nounwind {
; X64-NEXT: popq %rax
; X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f32:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %rax
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; MACOS-SINCOS-STRET-NEXT: popq %rax
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f32:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rax
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1
+; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: popq %rax
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: test_sincos_f32:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: subl $28, %esp
@@ -93,6 +119,28 @@ define { double, double } @test_sincos_f64(double %Val) nounwind {
; X64-NEXT: addq $24, %rsp
; X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f64:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %rax
+; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT: popq %rax
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f64:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: subq $24, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _sin
+; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero
+; MACOS-NOSINCOS-STRET-NEXT: callq _cos
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1
+; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero
+; MACOS-NOSINCOS-STRET-NEXT: addq $24, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: test_sincos_f64:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: subl $44, %esp
@@ -153,6 +201,40 @@ define { x86_fp80, x86_fp80 } @test_sincos_f80(x86_fp80 %Val) nounwind {
; X64-NEXT: addq $56, %rsp
; X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f80:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: subq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp)
+; MACOS-SINCOS-STRET-NEXT: fld %st(0)
+; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-SINCOS-STRET-NEXT: callq _cosl
+; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-SINCOS-STRET-NEXT: callq _sinl
+; MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: fxch %st(1)
+; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f80:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: subq $40, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp)
+; MACOS-NOSINCOS-STRET-NEXT: fld %st(0)
+; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosl
+; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinl
+; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: fxch %st(1)
+; MACOS-NOSINCOS-STRET-NEXT: addq $40, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: test_sincos_f80:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: subl $60, %esp
@@ -288,6 +370,57 @@ define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %
; SDAG-X64-NEXT: popq %r14
; SDAG-X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: can_fold_with_call_in_chain:
+; MACOS-SINCOS-STRET: ## %bb.0: ## %entry
+; MACOS-SINCOS-STRET-NEXT: pushq %r14
+; MACOS-SINCOS-STRET-NEXT: pushq %rbx
+; MACOS-SINCOS-STRET-NEXT: subq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movq %r14, %rdi
+; MACOS-SINCOS-STRET-NEXT: movq %rbx, %rsi
+; MACOS-SINCOS-STRET-NEXT: callq _foo
+; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%r14)
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%rbx)
+; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: popq %rbx
+; MACOS-SINCOS-STRET-NEXT: popq %r14
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: can_fold_with_call_in_chain:
+; MACOS-NOSINCOS-STRET: ## %bb.0: ## %entry
+; MACOS-NOSINCOS-STRET-NEXT: pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rax
+; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movq %r14, %rdi
+; MACOS-NOSINCOS-STRET-NEXT: movq %rbx, %rsi
+; MACOS-NOSINCOS-STRET-NEXT: callq _foo
+; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT: addq $8, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: popq %r14
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: can_fold_with_call_in_chain:
; GISEL-X86: # %bb.0: # %entry
; GISEL-X86-NEXT: pushl %ebx
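
The two new macOS prefixes differ only in the available runtime: on 10.9 and later a single llvm.sincos call lowers to one ___sincosf_stret libcall, while on 10.8 it is split into separate sinf and cosf calls with a spill in between. A minimal scalar form of the pattern being exercised (shown only as an illustration; it mirrors the tests in this file):

declare { float, float } @llvm.sincos.f32(float)

define float @sincos_sum(float %x) {
  ; One intrinsic call yields both results; extractvalue pulls them apart.
  %sc  = call { float, float } @llvm.sincos.f32(float %x)
  %sin = extractvalue { float, float } %sc, 0
  %cos = extractvalue { float, float } %sc, 1
  %sum = fadd float %sin, %cos
  ret float %sum
}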
diff --git a/llvm/test/CodeGen/X86/kmov.ll b/llvm/test/CodeGen/X86/kmov.ll
index 8b1e69a..5d216a2 100644
--- a/llvm/test/CodeGen/X86/kmov.ll
+++ b/llvm/test/CodeGen/X86/kmov.ll
@@ -477,16 +477,13 @@ define <32 x i1> @invert_i64_mask_extract_32(i64 %mask) {
; X64-AVX512-LABEL: invert_i64_mask_extract_32:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: kmovq %rdi, %k0
-; X64-AVX512-NEXT: knotb %k0, %k1
-; X64-AVX512-NEXT: kshiftrd $8, %k0, %k2
-; X64-AVX512-NEXT: knotb %k2, %k2
-; X64-AVX512-NEXT: kunpckbw %k1, %k2, %k1
+; X64-AVX512-NEXT: kshiftrd $8, %k0, %k1
+; X64-AVX512-NEXT: kunpckbw %k0, %k1, %k1
; X64-AVX512-NEXT: kshiftrd $16, %k0, %k2
-; X64-AVX512-NEXT: knotb %k2, %k2
; X64-AVX512-NEXT: kshiftrd $24, %k0, %k0
-; X64-AVX512-NEXT: knotb %k0, %k0
; X64-AVX512-NEXT: kunpckbw %k2, %k0, %k0
; X64-AVX512-NEXT: kunpckwd %k1, %k0, %k0
+; X64-AVX512-NEXT: knotd %k0, %k0
; X64-AVX512-NEXT: vpmovm2b %k0, %ymm0
; X64-AVX512-NEXT: retq
;
@@ -495,18 +492,16 @@ define <32 x i1> @invert_i64_mask_extract_32(i64 %mask) {
; X64-KNL-NEXT: movl %edi, %eax
; X64-KNL-NEXT: shrl $16, %eax
; X64-KNL-NEXT: kmovw %eax, %k0
-; X64-KNL-NEXT: knotw %k0, %k0
; X64-KNL-NEXT: movl %edi, %eax
; X64-KNL-NEXT: shrl $24, %eax
; X64-KNL-NEXT: kmovw %eax, %k1
-; X64-KNL-NEXT: knotw %k1, %k1
-; X64-KNL-NEXT: kunpckbw %k0, %k1, %k1
+; X64-KNL-NEXT: kunpckbw %k0, %k1, %k0
+; X64-KNL-NEXT: knotw %k0, %k1
; X64-KNL-NEXT: kmovw %edi, %k0
-; X64-KNL-NEXT: knotw %k0, %k0
; X64-KNL-NEXT: shrl $8, %edi
; X64-KNL-NEXT: kmovw %edi, %k2
-; X64-KNL-NEXT: knotw %k2, %k2
-; X64-KNL-NEXT: kunpckbw %k0, %k2, %k2
+; X64-KNL-NEXT: kunpckbw %k0, %k2, %k0
+; X64-KNL-NEXT: knotw %k0, %k2
; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; X64-KNL-NEXT: vpmovdb %zmm0, %xmm0
; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
@@ -586,27 +581,20 @@ define <64 x i1> @invert_i64_mask_extract_64(i64 %mask) {
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: kmovq %rdi, %k0
; X64-AVX512-NEXT: kshiftrq $32, %k0, %k1
-; X64-AVX512-NEXT: knotb %k1, %k1
; X64-AVX512-NEXT: kshiftrq $40, %k0, %k2
-; X64-AVX512-NEXT: knotb %k2, %k2
; X64-AVX512-NEXT: kunpckbw %k1, %k2, %k1
; X64-AVX512-NEXT: kshiftrq $48, %k0, %k2
-; X64-AVX512-NEXT: knotb %k2, %k2
; X64-AVX512-NEXT: kshiftrq $56, %k0, %k3
-; X64-AVX512-NEXT: knotb %k3, %k3
; X64-AVX512-NEXT: kunpckbw %k2, %k3, %k2
; X64-AVX512-NEXT: kunpckwd %k1, %k2, %k1
-; X64-AVX512-NEXT: knotb %k0, %k2
-; X64-AVX512-NEXT: kshiftrd $8, %k0, %k3
-; X64-AVX512-NEXT: knotb %k3, %k3
-; X64-AVX512-NEXT: kunpckbw %k2, %k3, %k2
+; X64-AVX512-NEXT: kshiftrd $8, %k0, %k2
+; X64-AVX512-NEXT: kunpckbw %k0, %k2, %k2
; X64-AVX512-NEXT: kshiftrd $16, %k0, %k3
-; X64-AVX512-NEXT: knotb %k3, %k3
; X64-AVX512-NEXT: kshiftrd $24, %k0, %k0
-; X64-AVX512-NEXT: knotb %k0, %k0
; X64-AVX512-NEXT: kunpckbw %k3, %k0, %k0
; X64-AVX512-NEXT: kunpckwd %k2, %k0, %k0
; X64-AVX512-NEXT: kunpckdq %k0, %k1, %k0
+; X64-AVX512-NEXT: knotq %k0, %k0
; X64-AVX512-NEXT: vpmovm2b %k0, %zmm0
; X64-AVX512-NEXT: retq
;
@@ -614,38 +602,34 @@ define <64 x i1> @invert_i64_mask_extract_64(i64 %mask) {
; X64-KNL: # %bb.0:
; X64-KNL-NEXT: movq %rdi, %rax
; X64-KNL-NEXT: kmovw %esi, %k0
-; X64-KNL-NEXT: knotw %k0, %k0
; X64-KNL-NEXT: movl %esi, %ecx
; X64-KNL-NEXT: shrl $8, %ecx
; X64-KNL-NEXT: kmovw %ecx, %k1
-; X64-KNL-NEXT: knotw %k1, %k1
; X64-KNL-NEXT: kunpckbw %k0, %k1, %k0
+; X64-KNL-NEXT: knotw %k0, %k0
; X64-KNL-NEXT: movl %esi, %ecx
; X64-KNL-NEXT: shrl $16, %ecx
; X64-KNL-NEXT: kmovw %ecx, %k1
-; X64-KNL-NEXT: knotw %k1, %k1
; X64-KNL-NEXT: movl %esi, %ecx
; X64-KNL-NEXT: shrl $24, %ecx
; X64-KNL-NEXT: kmovw %ecx, %k2
-; X64-KNL-NEXT: knotw %k2, %k2
; X64-KNL-NEXT: kunpckbw %k1, %k2, %k1
+; X64-KNL-NEXT: knotw %k1, %k1
; X64-KNL-NEXT: movq %rsi, %rcx
; X64-KNL-NEXT: shrq $32, %rcx
; X64-KNL-NEXT: kmovw %ecx, %k2
-; X64-KNL-NEXT: knotw %k2, %k2
; X64-KNL-NEXT: movq %rsi, %rcx
; X64-KNL-NEXT: shrq $40, %rcx
; X64-KNL-NEXT: kmovw %ecx, %k3
-; X64-KNL-NEXT: knotw %k3, %k3
; X64-KNL-NEXT: kunpckbw %k2, %k3, %k2
+; X64-KNL-NEXT: knotw %k2, %k2
; X64-KNL-NEXT: movq %rsi, %rcx
; X64-KNL-NEXT: shrq $48, %rcx
; X64-KNL-NEXT: kmovw %ecx, %k3
-; X64-KNL-NEXT: knotw %k3, %k3
; X64-KNL-NEXT: shrq $56, %rsi
; X64-KNL-NEXT: kmovw %esi, %k4
-; X64-KNL-NEXT: knotw %k4, %k4
; X64-KNL-NEXT: kunpckbw %k3, %k4, %k3
+; X64-KNL-NEXT: knotw %k3, %k3
; X64-KNL-NEXT: kmovw %k3, 6(%rdi)
; X64-KNL-NEXT: kmovw %k2, 4(%rdi)
; X64-KNL-NEXT: kmovw %k1, 2(%rdi)
diff --git a/llvm/test/CodeGen/X86/ldexp-avx512.ll b/llvm/test/CodeGen/X86/ldexp-avx512.ll
new file mode 100644
index 0000000..bb6dc31
--- /dev/null
+++ b/llvm/test/CodeGen/X86/ldexp-avx512.ll
@@ -0,0 +1,312 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512FP16
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VLF
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512fp16 | FileCheck %s --check-prefixes=CHECK,AVX512VLFP16
+
+define half @test_half(half %x, i32 %exp) nounwind {
+; AVX512F-LABEL: test_half:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1
+; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512F-NEXT: vscalefss %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512FP16-LABEL: test_half:
+; AVX512FP16: # %bb.0: # %entry
+; AVX512FP16-NEXT: vcvtsi2sh %edi, %xmm31, %xmm1
+; AVX512FP16-NEXT: vscalefsh %xmm1, %xmm0, %xmm0
+; AVX512FP16-NEXT: retq
+;
+; AVX512VL-LABEL: test_half:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %xmm0
+; AVX512VL-NEXT: vscalefss %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_half:
+; AVX512VLFP16: # %bb.0: # %entry
+; AVX512VLFP16-NEXT: vcvtsi2sh %edi, %xmm31, %xmm1
+; AVX512VLFP16-NEXT: vscalefsh %xmm1, %xmm0, %xmm0
+; AVX512VLFP16-NEXT: retq
+entry:
+ %r = tail call fast half @llvm.ldexp.f16.i32(half %x, i32 %exp)
+ ret half %r
+}
+declare half @llvm.ldexp.f16.i32(half, i32) memory(none)
+
+define float @test_float(float %x, i32 %exp) nounwind {
+; CHECK-LABEL: test_float:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1
+; CHECK-NEXT: vscalefss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %r = tail call fast float @ldexpf(float %x, i32 %exp)
+ ret float %r
+}
+declare float @ldexpf(float, i32) memory(none)
+
+define double @test_double(double %x, i32 %exp) nounwind {
+; CHECK-LABEL: test_double:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vcvtsi2sd %edi, %xmm15, %xmm1
+; CHECK-NEXT: vscalefsd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+entry:
+ %r = tail call fast double @ldexp(double %x, i32 %exp)
+ ret double %r
+}
+declare double @ldexp(double, i32) memory(none)
+
+define fp128 @testExpl(fp128 %x, i32 %exp) nounwind {
+; CHECK-LABEL: testExpl:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: jmp ldexpl@PLT # TAILCALL
+entry:
+ %r = tail call fast fp128 @ldexpl(fp128 %x, i32 %exp)
+ ret fp128 %r
+}
+declare fp128 @ldexpl(fp128, i32) memory(none)
+
+define <8 x half> @test_ldexp_8xhalf(<8 x half> %x, <8 x i16> %exp) nounwind {
+; AVX512F-LABEL: test_ldexp_8xhalf:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512FP16-LABEL: test_ldexp_8xhalf:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512FP16-NEXT: vinsertf32x4 $0, %xmm0, %zmm2, %zmm0
+; AVX512FP16-NEXT: vmovaps %xmm1, %xmm1
+; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0
+; AVX512FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512FP16-NEXT: vzeroupper
+; AVX512FP16-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_8xhalf:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX512VL-NEXT: vcvtph2ps %xmm0, %ymm0
+; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vcvtps2ph $4, %ymm0, %xmm0
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_8xhalf:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtw2ph %xmm1, %xmm1
+; AVX512VLFP16-NEXT: vscalefph %xmm1, %xmm0, %xmm0
+; AVX512VLFP16-NEXT: retq
+ %r = call <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half> %x, <8 x i16> %exp)
+ ret <8 x half> %r
+}
+declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
+
+define <4 x float> @test_ldexp_4xfloat(<4 x float> %x, <4 x i32> %exp) nounwind {
+; AVX512-LABEL: test_ldexp_4xfloat:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %xmm1, %xmm1
+; AVX512-NEXT: vmovaps %xmm0, %xmm0
+; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_4xfloat:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtdq2ps %xmm1, %xmm1
+; AVX512VL-NEXT: vscalefps %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_4xfloat:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtdq2ps %xmm1, %xmm1
+; AVX512VLFP16-NEXT: vscalefps %xmm1, %xmm0, %xmm0
+; AVX512VLFP16-NEXT: retq
+ %r = call <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %x, <4 x i32> %exp)
+ ret <4 x float> %r
+}
+declare <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
+
+define <2 x double> @test_ldexp_2xdouble(<2 x double> %x, <2 x i32> %exp) nounwind {
+; CHECK-LABEL: test_ldexp_2xdouble:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcvtdq2pd %xmm1, %xmm2
+; CHECK-NEXT: vscalefsd %xmm2, %xmm0, %xmm2
+; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
+; CHECK-NEXT: vcvtdq2pd %xmm1, %xmm1
+; CHECK-NEXT: vscalefsd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; CHECK-NEXT: retq
+ %r = call <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %x, <2 x i32> %exp)
+ ret <2 x double> %r
+}
+declare <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>)
+
+define <16 x half> @test_ldexp_16xhalf(<16 x half> %x, <16 x i16> %exp) nounwind {
+; AVX512F-LABEL: test_ldexp_16xhalf:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512FP16-LABEL: test_ldexp_16xhalf:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512FP16-NEXT: vinsertf64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512FP16-NEXT: vmovaps %ymm1, %ymm1
+; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0
+; AVX512FP16-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512FP16-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_16xhalf:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_16xhalf:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtw2ph %ymm1, %ymm1
+; AVX512VLFP16-NEXT: vscalefph %ymm1, %ymm0, %ymm0
+; AVX512VLFP16-NEXT: retq
+ %r = call <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half> %x, <16 x i16> %exp)
+ ret <16 x half> %r
+}
+declare <16 x half> @llvm.ldexp.v16f16.v16i16(<16 x half>, <16 x i16>)
+
+define <8 x float> @test_ldexp_8xfloat(<8 x float> %x, <8 x i32> %exp) nounwind {
+; AVX512-LABEL: test_ldexp_8xfloat:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovaps %ymm1, %ymm1
+; AVX512-NEXT: vmovaps %ymm0, %ymm0
+; AVX512-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_8xfloat:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX512VL-NEXT: vscalefps %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_8xfloat:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX512VLFP16-NEXT: vscalefps %ymm1, %ymm0, %ymm0
+; AVX512VLFP16-NEXT: retq
+ %r = call <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %x, <8 x i32> %exp)
+ ret <8 x float> %r
+}
+declare <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>)
+
+define <4 x double> @test_ldexp_4xdouble(<4 x double> %x, <4 x i32> %exp) nounwind {
+; AVX512-LABEL: test_ldexp_4xdouble:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovapd %xmm1, %xmm1
+; AVX512-NEXT: vmovapd %ymm0, %ymm0
+; AVX512-NEXT: vscalefpd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_4xdouble:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vcvtdq2pd %xmm1, %ymm1
+; AVX512VL-NEXT: vscalefpd %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_4xdouble:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtdq2pd %xmm1, %ymm1
+; AVX512VLFP16-NEXT: vscalefpd %ymm1, %ymm0, %ymm0
+; AVX512VLFP16-NEXT: retq
+ %r = call <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %x, <4 x i32> %exp)
+ ret <4 x double> %r
+}
+declare <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>)
+
+define <32 x half> @test_ldexp_32xhalf(<32 x half> %x, <32 x i16> %exp) nounwind {
+; AVX512F-LABEL: test_ldexp_32xhalf:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm2
+; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm3
+; AVX512F-NEXT: vscalefps %zmm2, %zmm3, %zmm2
+; AVX512F-NEXT: vcvtps2ph $4, %zmm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512F-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512F-NEXT: retq
+;
+; AVX512FP16-LABEL: test_ldexp_32xhalf:
+; AVX512FP16: # %bb.0:
+; AVX512FP16-NEXT: vcvtw2ph %zmm1, %zmm1
+; AVX512FP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0
+; AVX512FP16-NEXT: retq
+;
+; AVX512VL-LABEL: test_ldexp_32xhalf:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm2
+; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm3
+; AVX512VL-NEXT: vscalefps %zmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vcvtps2ph $4, %zmm2, %ymm2
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT: vcvtph2ps %ymm0, %zmm0
+; AVX512VL-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vcvtps2ph $4, %zmm0, %ymm0
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512VLFP16-LABEL: test_ldexp_32xhalf:
+; AVX512VLFP16: # %bb.0:
+; AVX512VLFP16-NEXT: vcvtw2ph %zmm1, %zmm1
+; AVX512VLFP16-NEXT: vscalefph %zmm1, %zmm0, %zmm0
+; AVX512VLFP16-NEXT: retq
+ %r = call <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half> %x, <32 x i16> %exp)
+ ret <32 x half> %r
+}
+declare <32 x half> @llvm.ldexp.v32f16.v32i16(<32 x half>, <32 x i16>)
+
+define <16 x float> @test_ldexp_16xfloat(<16 x float> %x, <16 x i32> %exp) nounwind {
+; CHECK-LABEL: test_ldexp_16xfloat:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcvtdq2ps %zmm1, %zmm1
+; CHECK-NEXT: vscalefps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %r = call <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> %x, <16 x i32> %exp)
+ ret <16 x float> %r
+}
+declare <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float>, <16 x i32>)
+
+define <8 x double> @test_ldexp_8xdouble(<8 x double> %x, <8 x i32> %exp) nounwind {
+; CHECK-LABEL: test_ldexp_8xdouble:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm1
+; CHECK-NEXT: vscalefpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %r = call <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> %x, <8 x i32> %exp)
+ ret <8 x double> %r
+}
+declare <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>)
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512VLF: {{.*}}
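
The checks in this file consistently pair an integer-to-float convert (vcvtsi2ss, vcvtdq2ps, vcvtw2ph) with a vscalef instruction. As background (the vscalef semantics below restate the instruction's documented behaviour; they are not taken from this patch):

\[
\operatorname{ldexp}(x, n) = x \cdot 2^{n},
\qquad
\operatorname{vscalef}(x, y) = x \cdot 2^{\lfloor y \rfloor}
\]

Converting the integer exponent to floating point first makes the two agree for every in-range exponent, which is why each scalef is preceded by a convert.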
diff --git a/llvm/test/CodeGen/X86/llc-accept-avx10-512.ll b/llvm/test/CodeGen/X86/llc-accept-avx10-512.ll
new file mode 100644
index 0000000..b5c9895
--- /dev/null
+++ b/llvm/test/CodeGen/X86/llc-accept-avx10-512.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+
+; avx10.x-512 is just avx10.x; the -512 suffix is kept for compatibility purposes.
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 2>&1 | FileCheck --check-prefixes=CHECK-AVX10_1 %s
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-512 2>&1 | FileCheck --check-prefixes=CHECK-AVX10_2 %s
+
+; CHECK-AVX10_1-NOT: is not recognizable
+; CHECK-AVX10_2-NOT: is not recognizable
+
+define <32 x bfloat> @foo_avx10.1(<16 x float> %a, <16 x float> %b) {
+; CHECK-AVX10_1-LABEL: foo_avx10.1:
+; CHECK-AVX10_1: # %bb.0:
+; CHECK-AVX10_1-NEXT: vcvtne2ps2bf16 %zmm1, %zmm0, %zmm0
+; CHECK-AVX10_1-NEXT: retq
+;
+; CHECK-AVX10_2-LABEL: foo_avx10.1:
+; CHECK-AVX10_2: # %bb.0:
+; CHECK-AVX10_2-NEXT: vcvtne2ps2bf16 %zmm1, %zmm0, %zmm0
+; CHECK-AVX10_2-NEXT: retq
+ %ret = call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a, <16 x float> %b)
+ ret <32 x bfloat> %ret
+}
+
+define <8 x i32> @foo_avx10.2(<8 x double> %f) {
+; CHECK-AVX10_1-LABEL: foo_avx10.2:
+; CHECK-AVX10_1: # %bb.0:
+; CHECK-AVX10_1-NEXT: vextractf32x4 $2, %zmm0, %xmm1
+; CHECK-AVX10_1-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; CHECK-AVX10_1-NEXT: vmovsd {{.*#+}} xmm3 = [-2.147483648E+9,0.0E+0]
+; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm2, %xmm4
+; CHECK-AVX10_1-NEXT: vmovsd {{.*#+}} xmm5 = [2.147483647E+9,0.0E+0]
+; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm4, %xmm4
+; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm4, %ecx
+; CHECK-AVX10_1-NEXT: xorl %eax, %eax
+; CHECK-AVX10_1-NEXT: vucomisd %xmm2, %xmm2
+; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx
+; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm1, %xmm2
+; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm2, %xmm2
+; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm2, %edx
+; CHECK-AVX10_1-NEXT: vucomisd %xmm1, %xmm1
+; CHECK-AVX10_1-NEXT: cmovpl %eax, %edx
+; CHECK-AVX10_1-NEXT: vmovd %edx, %xmm1
+; CHECK-AVX10_1-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1
+; CHECK-AVX10_1-NEXT: vextractf32x4 $3, %zmm0, %xmm2
+; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm2, %xmm4
+; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm4, %xmm4
+; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm4, %ecx
+; CHECK-AVX10_1-NEXT: vucomisd %xmm2, %xmm2
+; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx
+; CHECK-AVX10_1-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; CHECK-AVX10_1-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm2, %xmm4
+; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm4, %xmm4
+; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm4, %ecx
+; CHECK-AVX10_1-NEXT: vucomisd %xmm2, %xmm2
+; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx
+; CHECK-AVX10_1-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1
+; CHECK-AVX10_1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm2, %xmm4
+; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm4, %xmm4
+; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm4, %ecx
+; CHECK-AVX10_1-NEXT: vucomisd %xmm2, %xmm2
+; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx
+; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm0, %xmm2
+; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm2, %xmm2
+; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm2, %edx
+; CHECK-AVX10_1-NEXT: vucomisd %xmm0, %xmm0
+; CHECK-AVX10_1-NEXT: cmovpl %eax, %edx
+; CHECK-AVX10_1-NEXT: vmovd %edx, %xmm2
+; CHECK-AVX10_1-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2
+; CHECK-AVX10_1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm0, %xmm4
+; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm4, %xmm4
+; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm4, %ecx
+; CHECK-AVX10_1-NEXT: vucomisd %xmm0, %xmm0
+; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx
+; CHECK-AVX10_1-NEXT: vpinsrd $2, %ecx, %xmm2, %xmm2
+; CHECK-AVX10_1-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-AVX10_1-NEXT: vmaxsd %xmm3, %xmm0, %xmm3
+; CHECK-AVX10_1-NEXT: vminsd %xmm5, %xmm3, %xmm3
+; CHECK-AVX10_1-NEXT: vcvttsd2si %xmm3, %ecx
+; CHECK-AVX10_1-NEXT: vucomisd %xmm0, %xmm0
+; CHECK-AVX10_1-NEXT: cmovpl %eax, %ecx
+; CHECK-AVX10_1-NEXT: vpinsrd $3, %ecx, %xmm2, %xmm0
+; CHECK-AVX10_1-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-AVX10_1-NEXT: retq
+;
+; CHECK-AVX10_2-LABEL: foo_avx10.2:
+; CHECK-AVX10_2: # %bb.0:
+; CHECK-AVX10_2-NEXT: vcvttpd2dqs %zmm0, %ymm0
+; CHECK-AVX10_2-NEXT: retq
+ %x = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f64(<8 x double> %f)
+ ret <8 x i32> %x
+}
+
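
Given the comment that avx10.x-512 is just avx10.x, an equivalent RUN line using the unsuffixed attribute name would be expected to drive the same checks; this is an assumption based on that comment rather than something verified by this patch:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2 2>&1 | FileCheck --check-prefixes=CHECK-AVX10_2 %s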
diff --git a/llvm/test/CodeGen/X86/llc-fp-contract-warning.ll b/llvm/test/CodeGen/X86/llc-fp-contract-warning.ll
new file mode 100644
index 0000000..2802593
--- /dev/null
+++ b/llvm/test/CodeGen/X86/llc-fp-contract-warning.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=fast 2>&1 | grep "X86 backend ignores --fp-contract"
+
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=off 2>&1 | grep "X86 backend ignores --fp-contract"
+
+; "on", the default setting passed to the backend when no --fp-contract option is specified, is not diagnosed.
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown --fp-contract=on 2>&1 | grep -v "X86 backend ignores --fp-contract"
+
+define float @foo(float %f) {
+ %res = fadd float %f, %f
+ ret float %res
+}
+
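
Because the warning says the X86 backend ignores --fp-contract (apart from the default "on"), contraction on this target is normally requested per instruction instead. A small sketch, assuming the usual fast-math-flag path (hypothetical function, not part of the test), of IR that a target with FMA support may fuse:

define float @contract_candidate(float %a, float %b, float %c) {
  ; The contract flag on both operations permits fusing them into an FMA.
  %mul = fmul contract float %a, %b
  %add = fadd contract float %mul, %c
  ret float %add
}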
diff --git a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
index 834dd78..9b02438 100644
--- a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
+++ b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
@@ -1,59 +1,213 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 5
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck -check-prefix=X86 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 | FileCheck --check-prefix=MACOS-SINCOS-STRET %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 | FileCheck --check-prefix=MACOS-NOSINCOS-STRET %s
define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind {
-; CHECK-LABEL: test_sincos_v4f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $52, %esp
-; CHECK-NEXT: movl 84(%esp), %esi
-; CHECK-NEXT: flds 76(%esp)
-; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: flds 64(%esp)
-; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: flds 72(%esp)
-; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: flds 68(%esp)
-; CHECK-NEXT: movl 80(%esp), %edi
-; CHECK-NEXT: leal 40(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: leal 4(%edi), %eax
-; CHECK-NEXT: movl %eax, 4(%esp)
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: leal 44(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: leal 8(%edi), %eax
-; CHECK-NEXT: movl %eax, 4(%esp)
-; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: leal 36(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: movl %edi, 4(%esp)
-; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: leal 48(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: addl $12, %edi
-; CHECK-NEXT: movl %edi, 4(%esp)
-; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: flds 36(%esp)
-; CHECK-NEXT: flds 40(%esp)
-; CHECK-NEXT: flds 44(%esp)
-; CHECK-NEXT: flds 48(%esp)
-; CHECK-NEXT: fstps 12(%esi)
-; CHECK-NEXT: fstps 8(%esi)
-; CHECK-NEXT: fstps 4(%esi)
-; CHECK-NEXT: fstps (%esi)
-; CHECK-NEXT: addl $52, %esp
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: popl %edi
-; CHECK-NEXT: retl
+; X86-LABEL: test_sincos_v4f32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $52, %esp
+; X86-NEXT: movl 84(%esp), %esi
+; X86-NEXT: flds 76(%esp)
+; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: flds 64(%esp)
+; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: flds 72(%esp)
+; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: flds 68(%esp)
+; X86-NEXT: movl 80(%esp), %edi
+; X86-NEXT: leal 40(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: leal 4(%edi), %eax
+; X86-NEXT: movl %eax, 4(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: leal 44(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: leal 8(%edi), %eax
+; X86-NEXT: movl %eax, 4(%esp)
+; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: leal 36(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: movl %edi, 4(%esp)
+; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: leal 48(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: addl $12, %edi
+; X86-NEXT: movl %edi, 4(%esp)
+; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: flds 36(%esp)
+; X86-NEXT: flds 40(%esp)
+; X86-NEXT: flds 44(%esp)
+; X86-NEXT: flds 48(%esp)
+; X86-NEXT: fstps 12(%esi)
+; X86-NEXT: fstps 8(%esi)
+; X86-NEXT: fstps 4(%esi)
+; X86-NEXT: fstps (%esi)
+; X86-NEXT: addl $52, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: test_sincos_v4f32:
+; X64: # %bb.0:
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $56, %rsp
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X64-NEXT: leaq 4(%rsp), %rdi
+; X64-NEXT: movq %rsp, %rsi
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: leaq 12(%rsp), %rdi
+; X64-NEXT: leaq 8(%rsp), %rsi
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: leaq 28(%rsp), %rdi
+; X64-NEXT: leaq 24(%rsp), %rsi
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-NEXT: leaq 20(%rsp), %rdi
+; X64-NEXT: leaq 16(%rsp), %rsi
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT: movups %xmm1, (%r14)
+; X64-NEXT: movups %xmm0, (%rbx)
+; X64-NEXT: addq $56, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r14
+; X64-NEXT: retq
+;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_v4f32:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %r14
+; MACOS-SINCOS-STRET-NEXT: pushq %rbx
+; MACOS-SINCOS-STRET-NEXT: subq $104, %rsp
+; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, %xmm1
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; MACOS-SINCOS-STRET-NEXT: unpcklpd (%rsp), %xmm2 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: ## xmm2 = xmm2[0],mem[0]
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx)
+; MACOS-SINCOS-STRET-NEXT: addq $104, %rsp
+; MACOS-SINCOS-STRET-NEXT: popq %rbx
+; MACOS-SINCOS-STRET-NEXT: popq %r14
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v4f32:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: subq $104, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0]
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT: addq $104, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: popq %r14
+; MACOS-NOSINCOS-STRET-NEXT: retq
%result = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x)
%result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0
%result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1
@@ -63,36 +217,120 @@ define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias
}
define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind {
-; CHECK-LABEL: test_sincos_v2f64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $52, %esp
-; CHECK-NEXT: movl 84(%esp), %esi
-; CHECK-NEXT: fldl 72(%esp)
-; CHECK-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; CHECK-NEXT: fldl 64(%esp)
-; CHECK-NEXT: movl 80(%esp), %edi
-; CHECK-NEXT: leal 24(%esp), %eax
-; CHECK-NEXT: movl %eax, 12(%esp)
-; CHECK-NEXT: movl %edi, 8(%esp)
-; CHECK-NEXT: fstpl (%esp)
-; CHECK-NEXT: calll sincos
-; CHECK-NEXT: leal 32(%esp), %eax
-; CHECK-NEXT: movl %eax, 12(%esp)
-; CHECK-NEXT: addl $8, %edi
-; CHECK-NEXT: movl %edi, 8(%esp)
-; CHECK-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
-; CHECK-NEXT: fstpl (%esp)
-; CHECK-NEXT: calll sincos
-; CHECK-NEXT: fldl 24(%esp)
-; CHECK-NEXT: fldl 32(%esp)
-; CHECK-NEXT: fstpl 8(%esi)
-; CHECK-NEXT: fstpl (%esi)
-; CHECK-NEXT: addl $52, %esp
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: popl %edi
-; CHECK-NEXT: retl
+; X86-LABEL: test_sincos_v2f64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $52, %esp
+; X86-NEXT: movl 84(%esp), %esi
+; X86-NEXT: fldl 72(%esp)
+; X86-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
+; X86-NEXT: fldl 64(%esp)
+; X86-NEXT: movl 80(%esp), %edi
+; X86-NEXT: leal 24(%esp), %eax
+; X86-NEXT: movl %eax, 12(%esp)
+; X86-NEXT: movl %edi, 8(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll sincos
+; X86-NEXT: leal 32(%esp), %eax
+; X86-NEXT: movl %eax, 12(%esp)
+; X86-NEXT: addl $8, %edi
+; X86-NEXT: movl %edi, 8(%esp)
+; X86-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll sincos
+; X86-NEXT: fldl 24(%esp)
+; X86-NEXT: fldl 32(%esp)
+; X86-NEXT: fstpl 8(%esi)
+; X86-NEXT: fstpl (%esi)
+; X86-NEXT: addl $52, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: test_sincos_v2f64:
+; X64: # %bb.0:
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $56, %rsp
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: leaq 24(%rsp), %rdi
+; X64-NEXT: leaq 16(%rsp), %rsi
+; X64-NEXT: callq sincos@PLT
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: leaq 8(%rsp), %rdi
+; X64-NEXT: movq %rsp, %rsi
+; X64-NEXT: callq sincos@PLT
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; X64-NEXT: movups %xmm1, (%r14)
+; X64-NEXT: movups %xmm0, (%rbx)
+; X64-NEXT: addq $56, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r14
+; X64-NEXT: retq
+;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_v2f64:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %r14
+; MACOS-SINCOS-STRET-NEXT: pushq %rbx
+; MACOS-SINCOS-STRET-NEXT: subq $56, %rsp
+; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx)
+; MACOS-SINCOS-STRET-NEXT: addq $56, %rsp
+; MACOS-SINCOS-STRET-NEXT: popq %rbx
+; MACOS-SINCOS-STRET-NEXT: popq %r14
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v2f64:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: subq $56, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cos
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cos
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sin
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sin
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT: addq $56, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: popq %r14
+; MACOS-NOSINCOS-STRET-NEXT: retq
%result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x)
%result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0
%result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1
diff --git a/llvm/test/CodeGen/X86/llvm.sincospi.ll b/llvm/test/CodeGen/X86/llvm.sincospi.ll
new file mode 100644
index 0000000..5546c66
--- /dev/null
+++ b/llvm/test/CodeGen/X86/llvm.sincospi.ll
@@ -0,0 +1,233 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=x86_64-apple-macosx10.9 < %s | FileCheck %s
+
+define { half, half } @test_sincospi_f16(half %a) #0 {
+; CHECK-LABEL: test_sincospi_f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movzwl %ax, %edi
+; CHECK-NEXT: callq ___extendhfsf2
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: callq ___sincospif
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq ___truncsfhf2
+; CHECK-NEXT: ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq ___truncsfhf2
+; CHECK-NEXT: ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT: pinsrw $0, %eax, %xmm1
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %result = call { half, half } @llvm.sincospi.f16(half %a)
+ ret { half, half } %result
+}
+
+define half @test_sincospi_f16_only_use_sin(half %a) #0 {
+; CHECK-LABEL: test_sincospi_f16_only_use_sin:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movzwl %ax, %edi
+; CHECK-NEXT: callq ___extendhfsf2
+; CHECK-NEXT: movq %rsp, %rdi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: callq ___sincospif
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq ___truncsfhf2
+; CHECK-NEXT: ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %result = call { half, half } @llvm.sincospi.f16(half %a)
+ %result.0 = extractvalue { half, half } %result, 0
+ ret half %result.0
+}
+
+define half @test_sincospi_f16_only_use_cos(half %a) #0 {
+; CHECK-LABEL: test_sincospi_f16_only_use_cos:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movzwl %ax, %edi
+; CHECK-NEXT: callq ___extendhfsf2
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movq %rsp, %rsi
+; CHECK-NEXT: callq ___sincospif
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq ___truncsfhf2
+; CHECK-NEXT: ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %result = call { half, half } @llvm.sincospi.f16(half %a)
+ %result.1 = extractvalue { half, half } %result, 1
+ ret half %result.1
+}
+
+define { <2 x half>, <2 x half> } @test_sincospi_v2f16(<2 x half> %a) #0 {
+; CHECK-LABEL: test_sincospi_v2f16:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: subq $64, %rsp
+; CHECK-NEXT: pextrw $0, %xmm0, %ebx
+; CHECK-NEXT: psrld $16, %xmm0
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movzwl %ax, %edi
+; CHECK-NEXT: callq ___extendhfsf2
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movq %rsp, %rsi
+; CHECK-NEXT: callq ___sincospif
+; CHECK-NEXT: movzwl %bx, %edi
+; CHECK-NEXT: callq ___extendhfsf2
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: callq ___sincospif
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq ___truncsfhf2
+; CHECK-NEXT: ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq ___truncsfhf2
+; CHECK-NEXT: ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq ___truncsfhf2
+; CHECK-NEXT: ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: callq ___truncsfhf2
+; CHECK-NEXT: ## kill: def $ax killed $ax def $eax
+; CHECK-NEXT: pinsrw $0, %eax, %xmm0
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; CHECK-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; CHECK-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; CHECK-NEXT: addq $64, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+ %result = call { <2 x half>, <2 x half> } @llvm.sincospi.v2f16(<2 x half> %a)
+ ret { <2 x half>, <2 x half> } %result
+}
+
+define { float, float } @test_sincospi_f32(float %a) #0 {
+; CHECK-LABEL: test_sincospi_f32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movq %rsp, %rsi
+; CHECK-NEXT: callq ___sincospif
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: popq %rax
+; CHECK-NEXT: retq
+ %result = call { float, float } @llvm.sincospi.f32(float %a)
+ ret { float, float } %result
+}
+
+define { <2 x float>, <2 x float> } @test_sincospi_v2f32(<2 x float> %a) #0 {
+; CHECK-LABEL: test_sincospi_v2f32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movq %rsp, %rsi
+; CHECK-NEXT: callq ___sincospif
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: callq ___sincospif
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: retq
+ %result = call { <2 x float>, <2 x float> } @llvm.sincospi.v2f32(<2 x float> %a)
+ ret { <2 x float>, <2 x float> } %result
+}
+
+define { <3 x float>, <3 x float> } @test_sincospi_v3f32(<3 x float> %a) #0 {
+; CHECK-LABEL: test_sincospi_v3f32:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: callq ___sincospif
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: callq ___sincospif
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: callq ___sincospif
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: retq
+ %result = call { <3 x float>, <3 x float> } @llvm.sincospi.v3f32(<3 x float> %a)
+ ret { <3 x float>, <3 x float> } %result
+}
+
+define { double, double } @test_sincospi_f64(double %a) #0 {
+; CHECK-LABEL: test_sincospi_f64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: callq ___sincospi
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: retq
+ %result = call { double, double } @llvm.sincospi.f64(double %a)
+ ret { double, double } %result
+}
+
+define { <2 x double>, <2 x double> } @test_sincospi_v2f64(<2 x double> %a) #0 {
+; CHECK-LABEL: test_sincospi_v2f64:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: subq $56, %rsp
+; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: callq ___sincospi
+; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movq %rsp, %rsi
+; CHECK-NEXT: callq ___sincospi
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; CHECK-NEXT: addq $56, %rsp
+; CHECK-NEXT: retq
+ %result = call { <2 x double>, <2 x double> } @llvm.sincospi.v2f64(<2 x double> %a)
+ ret { <2 x double>, <2 x double> } %result
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/X86/loop-strength-reduce5.ll b/llvm/test/CodeGen/X86/loop-strength-reduce5.ll
index ebae51f..0800373 100644
--- a/llvm/test/CodeGen/X86/loop-strength-reduce5.ll
+++ b/llvm/test/CodeGen/X86/loop-strength-reduce5.ll
@@ -16,11 +16,11 @@ define void @foo(i32 %N) nounwind {
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: # %bb
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movw %dx, X
-; CHECK-NEXT: movw %cx, Y
-; CHECK-NEXT: incl %edx
-; CHECK-NEXT: addl $4, %ecx
-; CHECK-NEXT: cmpl %edx, %eax
+; CHECK-NEXT: movw %cx, X
+; CHECK-NEXT: movw %dx, Y
+; CHECK-NEXT: incl %ecx
+; CHECK-NEXT: addl $4, %edx
+; CHECK-NEXT: cmpl %ecx, %eax
; CHECK-NEXT: jne .LBB0_2
; CHECK-NEXT: .LBB0_3: # %return
; CHECK-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index 2a2a4a5..209ee79 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -1480,15 +1480,15 @@ define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readon
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB10_1
; AVX2-NEXT: # %bb.2: # %middle.block
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1728,10 +1728,10 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado
; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm6, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpaddd %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; AVX2-NEXT: vpmulld %ymm7, %ymm4, %ymm4
; AVX2-NEXT: vpaddd %ymm3, %ymm4, %ymm3
@@ -1739,9 +1739,9 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB11_1
; AVX2-NEXT: # %bb.2: # %middle.block
-; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
@@ -1765,15 +1765,15 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmulld %zmm2, %zmm4, %zmm2
-; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0
; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512-NEXT: vpmulld %zmm3, %zmm2, %zmm2
-; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vpaddd %zmm1, %zmm2, %zmm1
; AVX512-NEXT: addq $16, %rcx
; AVX512-NEXT: cmpq %rcx, %rax
; AVX512-NEXT: jne .LBB11_1
; AVX512-NEXT: # %bb.2: # %middle.block
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index caec02e..58adbb7 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -207,15 +207,15 @@ declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> , <16 x ptr> , i32 , <
; SCALAR-NEXT: store i32 %Elt2, ptr %Ptr23, align 4
define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) {
-; X64-LABEL: test6:
-; X64: # %bb.0:
-; X64-NEXT: kxnorw %k0, %k0, %k1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: kxnorw %k0, %k0, %k2
-; X64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
-; X64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
-; X64-NEXT: vmovdqa %ymm2, %ymm0
-; X64-NEXT: retq
+; X64-KNL-LABEL: test6:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: kxnorw %k0, %k0, %k1
+; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-KNL-NEXT: kxnorw %k0, %k0, %k2
+; X64-KNL-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; X64-KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; X64-KNL-NEXT: vmovdqa %ymm2, %ymm0
+; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test6:
; X86-KNL: # %bb.0:
@@ -230,11 +230,21 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) {
; X86-KNL-NEXT: vmovdqa %ymm2, %ymm0
; X86-KNL-NEXT: retl
;
+; X64-SKX-LABEL: test6:
+; X64-SKX: # %bb.0:
+; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
+; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-SKX-NEXT: kxnorb %k0, %k0, %k2
+; X64-SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; X64-SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
+; X64-SKX-NEXT: vmovdqa %ymm2, %ymm0
+; X64-SKX-NEXT: retq
+;
; X86-SKX-LABEL: test6:
; X86-SKX: # %bb.0:
-; X86-SKX-NEXT: kxnorw %k0, %k0, %k1
+; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
; X86-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X86-SKX-NEXT: kxnorw %k0, %k0, %k2
+; X86-SKX-NEXT: kxnorb %k0, %k0, %k2
; X86-SKX-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
; X86-SKX-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
; X86-SKX-NEXT: vmovdqa %ymm2, %ymm0
@@ -255,9 +265,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) {
; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-KNL-NEXT: kmovw %k1, %k2
; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
-; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm2
-; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
-; X64-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0
+; X64-KNL-NEXT: vmovdqa %ymm1, %ymm2
+; X64-KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
+; X64-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0
; X64-KNL-NEXT: retq
;
; X86-KNL-LABEL: test7:
@@ -271,9 +281,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) {
; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X86-KNL-NEXT: kmovw %k1, %k2
; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
-; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm2
-; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
-; X86-KNL-NEXT: vpaddd %ymm2, %ymm1, %ymm0
+; X86-KNL-NEXT: vmovdqa %ymm1, %ymm2
+; X86-KNL-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1}
+; X86-KNL-NEXT: vpaddd %ymm1, %ymm2, %ymm0
; X86-KNL-NEXT: retl
;
; X64-SKX-LABEL: test7:
@@ -397,7 +407,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
-; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1
+; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1
; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; X64-SKX-SMALL-NEXT: retq
@@ -412,7 +422,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1
-; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1
+; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1
; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; X64-SKX-LARGE-NEXT: retq
@@ -424,7 +434,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; X86-SKX-NEXT: kxnorw %k0, %k0, %k1
+; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1}
; X86-SKX-NEXT: retl
@@ -481,7 +491,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
-; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1
+; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1
; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; X64-SKX-SMALL-NEXT: retq
@@ -496,7 +506,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1
-; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1
+; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1
; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
; X64-SKX-LARGE-NEXT: retq
@@ -508,7 +518,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1
-; X86-SKX-NEXT: kxnorw %k0, %k0, %k1
+; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1}
; X86-SKX-NEXT: retl
@@ -2465,17 +2475,17 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
declare <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x ptr>)
define <16 x ptr> @test31(<16 x ptr> %ptrs) {
-; X64-LABEL: test31:
-; X64: # %bb.0:
-; X64-NEXT: kxnorw %k0, %k0, %k1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; X64-NEXT: kxnorw %k0, %k0, %k2
-; X64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
-; X64-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
-; X64-NEXT: vmovdqa64 %zmm3, %zmm0
-; X64-NEXT: vmovdqa64 %zmm2, %zmm1
-; X64-NEXT: retq
+; X64-KNL-LABEL: test31:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: kxnorw %k0, %k0, %k1
+; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; X64-KNL-NEXT: kxnorw %k0, %k0, %k2
+; X64-KNL-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
+; X64-KNL-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
+; X64-KNL-NEXT: vmovdqa64 %zmm3, %zmm0
+; X64-KNL-NEXT: vmovdqa64 %zmm2, %zmm1
+; X64-KNL-NEXT: retq
;
; X86-LABEL: test31:
; X86: # %bb.0:
@@ -2484,6 +2494,18 @@ define <16 x ptr> @test31(<16 x ptr> %ptrs) {
; X86-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
+;
+; X64-SKX-LABEL: test31:
+; X64-SKX: # %bb.0:
+; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
+; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-SKX-NEXT: kxnorb %k0, %k0, %k2
+; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; X64-SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
+; X64-SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
+; X64-SKX-NEXT: vmovdqa64 %zmm3, %zmm0
+; X64-SKX-NEXT: vmovdqa64 %zmm2, %zmm1
+; X64-SKX-NEXT: retq
%res = call <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x ptr> undef)
ret <16 x ptr>%res
}
@@ -3253,17 +3275,17 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) {
; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0
; X64-KNL-NEXT: retq
;
-; X86-LABEL: test_global_array:
-; X86: # %bb.0:
-; X86-NEXT: kxnorw %k0, %k0, %k1
-; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
-; X86-NEXT: vmovdqa %ymm1, %ymm0
-; X86-NEXT: retl
+; X86-KNL-LABEL: test_global_array:
+; X86-KNL: # %bb.0:
+; X86-KNL-NEXT: kxnorw %k0, %k0, %k1
+; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
+; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0
+; X86-KNL-NEXT: retl
;
; X64-SKX-SMALL-LABEL: test_global_array:
; X64-SKX-SMALL: # %bb.0:
-; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1
+; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1
; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0
@@ -3272,11 +3294,19 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) {
; X64-SKX-LARGE-LABEL: test_global_array:
; X64-SKX-LARGE: # %bb.0:
; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax
-; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1
+; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1
; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0
; X64-SKX-LARGE-NEXT: retq
+;
+; X86-SKX-LABEL: test_global_array:
+; X86-SKX: # %bb.0:
+; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
+; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
+; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0
+; X86-SKX-NEXT: retl
%p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <8 x i64> %indxs
%g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
ret <8 x i32> %g
@@ -3291,17 +3321,17 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0
; X64-KNL-NEXT: retq
;
-; X86-LABEL: test_global_array_zeroinitializer_index:
-; X86: # %bb.0:
-; X86-NEXT: kxnorw %k0, %k0, %k1
-; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
-; X86-NEXT: vmovdqa %ymm1, %ymm0
-; X86-NEXT: retl
+; X86-KNL-LABEL: test_global_array_zeroinitializer_index:
+; X86-KNL: # %bb.0:
+; X86-KNL-NEXT: kxnorw %k0, %k0, %k1
+; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
+; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0
+; X86-KNL-NEXT: retl
;
; X64-SKX-SMALL-LABEL: test_global_array_zeroinitializer_index:
; X64-SKX-SMALL: # %bb.0:
-; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1
+; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1
; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0
@@ -3310,11 +3340,19 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
; X64-SKX-LARGE-LABEL: test_global_array_zeroinitializer_index:
; X64-SKX-LARGE: # %bb.0:
; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax
-; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1
+; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1
; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0
; X64-SKX-LARGE-NEXT: retq
+;
+; X86-SKX-LABEL: test_global_array_zeroinitializer_index:
+; X86-SKX: # %bb.0:
+; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
+; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
+; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0
+; X86-SKX-NEXT: retl
%p = getelementptr inbounds [16 x i32], ptr @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs
%g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
ret <8 x i32> %g
@@ -3545,7 +3583,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) {
; X64-SKX-LABEL: sext_v8i8_index:
; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpmovsxbd %xmm0, %ymm1
-; X64-SKX-NEXT: kxnorw %k0, %k0, %k1
+; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; X64-SKX-NEXT: retq
@@ -3554,7 +3592,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) {
; X86-SKX: # %bb.0:
; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SKX-NEXT: vpmovsxbd %xmm0, %ymm1
-; X86-SKX-NEXT: kxnorw %k0, %k0, %k1
+; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; X86-SKX-NEXT: retl
@@ -3617,7 +3655,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) {
; X64-SKX-LABEL: zext_v8i8_index:
; X64-SKX: # %bb.0:
; X64-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; X64-SKX-NEXT: kxnorw %k0, %k0, %k1
+; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; X64-SKX-NEXT: retq
@@ -3626,7 +3664,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) {
; X86-SKX: # %bb.0:
; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; X86-SKX-NEXT: kxnorw %k0, %k0, %k1
+; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; X86-SKX-NEXT: retl
@@ -4793,19 +4831,19 @@ define <16 x i32> @pr163023_sext(ptr %a0, <16 x i32> %a1) {
}
define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) {
-; X64-LABEL: pr163023_zext:
-; X64: # %bb.0:
-; X64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; X64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; X64-NEXT: kxnorw %k0, %k0, %k1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; X64-NEXT: kxnorw %k0, %k0, %k2
-; X64-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2}
-; X64-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1}
-; X64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
-; X64-NEXT: retq
+; X64-KNL-LABEL: pr163023_zext:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-KNL-NEXT: kxnorw %k0, %k0, %k1
+; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; X64-KNL-NEXT: kxnorw %k0, %k0, %k2
+; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2}
+; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1}
+; X64-KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
+; X64-KNL-NEXT: retq
;
; X86-LABEL: pr163023_zext:
; X86: # %bb.0:
@@ -4815,6 +4853,20 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) {
; X86-NEXT: vpgatherdd (%eax,%zmm0), %zmm1 {%k1}
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
; X86-NEXT: retl
+;
+; X64-SKX-LABEL: pr163023_zext:
+; X64-SKX: # %bb.0:
+; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
+; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-SKX-NEXT: kxnorb %k0, %k0, %k2
+; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2}
+; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1}
+; X64-SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
+; X64-SKX-NEXT: retq
%addr.p = ptrtoint ptr %a0 to i64
%addr.v = insertelement <1 x i64> poison, i64 %addr.p, i64 0
%addr.splat = shufflevector <1 x i64> %addr.v, <1 x i64> poison, <16 x i32> zeroinitializer
@@ -4834,21 +4886,37 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) {
%struct.foo = type { ptr, i64, i16, i16, i32 }
define <8 x i64> @pr45906(<8 x ptr> %ptr) {
-; X64-LABEL: pr45906:
-; X64: # %bb.0: # %bb
-; X64-NEXT: kxnorw %k0, %k0, %k1
-; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1}
-; X64-NEXT: vmovdqa64 %zmm1, %zmm0
-; X64-NEXT: retq
+; X64-KNL-LABEL: pr45906:
+; X64-KNL: # %bb.0: # %bb
+; X64-KNL-NEXT: kxnorw %k0, %k0, %k1
+; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-KNL-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1}
+; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm0
+; X64-KNL-NEXT: retq
;
-; X86-LABEL: pr45906:
-; X86: # %bb.0: # %bb
-; X86-NEXT: kxnorw %k0, %k0, %k1
-; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X86-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1}
-; X86-NEXT: vmovdqa64 %zmm1, %zmm0
-; X86-NEXT: retl
+; X86-KNL-LABEL: pr45906:
+; X86-KNL: # %bb.0: # %bb
+; X86-KNL-NEXT: kxnorw %k0, %k0, %k1
+; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-KNL-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1}
+; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm0
+; X86-KNL-NEXT: retl
+;
+; X64-SKX-LABEL: pr45906:
+; X64-SKX: # %bb.0: # %bb
+; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
+; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-SKX-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1}
+; X64-SKX-NEXT: vmovdqa64 %zmm1, %zmm0
+; X64-SKX-NEXT: retq
+;
+; X86-SKX-LABEL: pr45906:
+; X86-SKX: # %bb.0: # %bb
+; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
+; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-SKX-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1}
+; X86-SKX-NEXT: vmovdqa64 %zmm1, %zmm0
+; X86-SKX-NEXT: retl
bb:
%tmp = getelementptr inbounds %struct.foo, <8 x ptr> %ptr, i64 0, i32 1
%tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> %tmp, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> undef)
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 18d394e..57b0577 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -4,9 +4,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512FVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512FVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512BWVL
define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i32:
@@ -350,14 +350,21 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: truncstore_v8i64_v8i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512FVL-LABEL: truncstore_v8i64_v8i32:
+; AVX512FVL: # %bb.0:
+; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512FVL-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
+; AVX512FVL-NEXT: vzeroupper
+; AVX512FVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: truncstore_v8i64_v8i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX512BWVL-NEXT: vpmovsqd %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
%b = icmp slt <8 x i64> %x, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
%c = select <8 x i1> %b, <8 x i64> %x, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
@@ -964,9 +971,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i64_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqw %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -1572,9 +1577,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i64_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqb %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -1788,14 +1791,21 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: truncstore_v4i64_v4i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1}
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512FVL-LABEL: truncstore_v4i64_v4i32:
+; AVX512FVL: # %bb.0:
+; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512FVL-NEXT: vpmovqd %ymm0, (%rdi) {%k1}
+; AVX512FVL-NEXT: vzeroupper
+; AVX512FVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: truncstore_v4i64_v4i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512BWVL-NEXT: vpmovsqd %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
%b = icmp slt <4 x i64> %x, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
%c = select <4 x i1> %b, <4 x i64> %x, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
@@ -2141,9 +2151,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i64_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqw %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
@@ -2495,9 +2503,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i64_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqb %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
@@ -2641,13 +2647,19 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: truncstore_v2i64_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1}
-; AVX512VL-NEXT: retq
+; AVX512FVL-LABEL: truncstore_v2i64_v2i32:
+; AVX512FVL: # %bb.0:
+; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k1
+; AVX512FVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512FVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512FVL-NEXT: vpmovqd %xmm0, (%rdi) {%k1}
+; AVX512FVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: truncstore_v2i64_v2i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
+; AVX512BWVL-NEXT: vpmovsqd %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: retq
%a = icmp ne <2 x i64> %mask, zeroinitializer
%b = icmp slt <2 x i64> %x, <i64 2147483647, i64 2147483647>
%c = select <2 x i1> %b, <2 x i64> %x, <2 x i64> <i64 2147483647, i64 2147483647>
@@ -2832,9 +2844,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX512BWVL-LABEL: truncstore_v2i64_v2i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqw %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <2 x i64> %mask, zeroinitializer
%b = icmp slt <2 x i64> %x, <i64 32767, i64 32767>
@@ -3018,9 +3028,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX512BWVL-LABEL: truncstore_v2i64_v2i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsqb %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <2 x i64> %mask, zeroinitializer
%b = icmp slt <2 x i64> %x, <i64 127, i64 127>
@@ -3816,9 +3824,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v16i32_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdw %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <16 x i32> %mask, zeroinitializer
@@ -4594,9 +4600,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v16i32_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdb %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <16 x i32> %mask, zeroinitializer
@@ -5034,9 +5038,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i32_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdw %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -5473,9 +5475,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i32_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdb %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -5686,9 +5686,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i32_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdw %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
%b = icmp slt <4 x i32> %x, <i32 32767, i32 32767, i32 32767, i32 32767>
@@ -5904,9 +5902,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i32_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovsdb %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
%b = icmp slt <4 x i32> %x, <i32 127, i32 127, i32 127, i32 127>
@@ -7332,9 +7328,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
; AVX512BWVL-LABEL: truncstore_v32i16_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovswb %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <32 x i8> %mask, zeroinitializer
@@ -8083,9 +8077,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) {
; AVX512BWVL-LABEL: truncstore_v16i16_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovswb %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <16 x i8> %mask, zeroinitializer
@@ -8445,9 +8437,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i16_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmaxsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovswb %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i16> %mask, zeroinitializer
%b = icmp slt <8 x i16> %x, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 4c4b6e7..0386d95 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -4,9 +4,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512FVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512FVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512BWVL
define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i32:
@@ -281,13 +281,20 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: truncstore_v8i64_v8i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512FVL-LABEL: truncstore_v8i64_v8i32:
+; AVX512FVL: # %bb.0:
+; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX512FVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512FVL-NEXT: vpmovqd %zmm0, (%rdi) {%k1}
+; AVX512FVL-NEXT: vzeroupper
+; AVX512FVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: truncstore_v8i64_v8i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
+; AVX512BWVL-NEXT: vpmovusqd %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
%b = icmp ult <8 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%c = select <8 x i1> %b, <8 x i64> %x, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -829,8 +836,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i64_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovqw %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusqw %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -1367,8 +1373,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i64_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovqb %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusqb %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -1547,13 +1552,20 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: truncstore_v4i64_v4i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1}
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512FVL-LABEL: truncstore_v4i64_v4i32:
+; AVX512FVL: # %bb.0:
+; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512FVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
+; AVX512FVL-NEXT: vpmovqd %ymm0, (%rdi) {%k1}
+; AVX512FVL-NEXT: vzeroupper
+; AVX512FVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: truncstore_v4i64_v4i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
+; AVX512BWVL-NEXT: vpmovusqd %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
%b = icmp ult <4 x i64> %x, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
%c = select <4 x i1> %b, <4 x i64> %x, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -1868,8 +1880,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i64_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusqw %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
@@ -2188,8 +2199,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i64_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusqb %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
@@ -2304,12 +2314,18 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
-; AVX512VL-LABEL: truncstore_v2i64_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1
-; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1}
-; AVX512VL-NEXT: retq
+; AVX512FVL-LABEL: truncstore_v2i64_v2i32:
+; AVX512FVL: # %bb.0:
+; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k1
+; AVX512FVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512FVL-NEXT: vpmovqd %xmm0, (%rdi) {%k1}
+; AVX512FVL-NEXT: retq
+;
+; AVX512BWVL-LABEL: truncstore_v2i64_v2i32:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
+; AVX512BWVL-NEXT: vpmovusqd %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: retq
%a = icmp ne <2 x i64> %mask, zeroinitializer
%b = icmp ult <2 x i64> %x, <i64 4294967295, i64 4294967295>
%c = select <2 x i1> %b, <2 x i64> %x, <2 x i64> <i64 4294967295, i64 4294967295>
@@ -2470,8 +2486,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX512BWVL-LABEL: truncstore_v2i64_v2i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusqw %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <2 x i64> %mask, zeroinitializer
%b = icmp ult <2 x i64> %x, <i64 65535, i64 65535>
@@ -2630,8 +2645,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX512BWVL-LABEL: truncstore_v2i64_v2i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmq %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusqb %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <2 x i64> %mask, zeroinitializer
%b = icmp ult <2 x i64> %x, <i64 255, i64 255>
@@ -3457,8 +3471,7 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v16i32_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovdw %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusdw %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <16 x i32> %mask, zeroinitializer
@@ -4273,8 +4286,7 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v16i32_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovdb %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusdb %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <16 x i32> %mask, zeroinitializer
@@ -4737,8 +4749,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i32_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusdw %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -5194,8 +5205,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i32_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i32> %mask, zeroinitializer
@@ -5455,8 +5465,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i32_v4i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusdw %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
%b = icmp ult <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
@@ -5717,8 +5726,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; AVX512BWVL-LABEL: truncstore_v4i32_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovusdb %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <4 x i32> %mask, zeroinitializer
%b = icmp ult <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
@@ -7171,8 +7179,7 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) {
; AVX512BWVL-LABEL: truncstore_v32i16_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmb %ymm1, %ymm1, %k1
-; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovuswb %zmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <32 x i8> %mask, zeroinitializer
@@ -7935,8 +7942,7 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) {
; AVX512BWVL-LABEL: truncstore_v16i16_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmb %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovuswb %ymm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
%a = icmp ne <16 x i8> %mask, zeroinitializer
@@ -8302,8 +8308,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
; AVX512BWVL-LABEL: truncstore_v8i16_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vptestmw %xmm1, %xmm1, %k1
-; AVX512BWVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rdi) {%k1}
+; AVX512BWVL-NEXT: vpmovuswb %xmm0, (%rdi) {%k1}
; AVX512BWVL-NEXT: retq
%a = icmp ne <8 x i16> %mask, zeroinitializer
%b = icmp ult <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 388d852..f38b769 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -368,46 +368,47 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
; AVX512F-NEXT: vaddps %xmm4, %xmm9, %xmm9
; AVX512F-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX512F-NEXT: vmulss %xmm1, %xmm4, %xmm10
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; AVX512F-NEXT: vmulss %xmm6, %xmm5, %xmm6
-; AVX512F-NEXT: vaddss %xmm6, %xmm10, %xmm6
-; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm10
-; AVX512F-NEXT: vmulss %xmm8, %xmm10, %xmm8
-; AVX512F-NEXT: vaddss %xmm6, %xmm8, %xmm6
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm6 = xmm9[0,1],xmm6[0],xmm9[3]
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm5[1,1,3,3]
+; AVX512F-NEXT: vmulss %xmm6, %xmm11, %xmm5
+; AVX512F-NEXT: vaddss %xmm5, %xmm10, %xmm5
+; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm6
+; AVX512F-NEXT: vmulss %xmm6, %xmm8, %xmm8
+; AVX512F-NEXT: vaddss %xmm5, %xmm8, %xmm5
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm5[0],xmm9[3]
; AVX512F-NEXT: vmulps %xmm7, %xmm0, %xmm8
; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm9
-; AVX512F-NEXT: vmovsldup {{.*#+}} xmm11 = xmm9[0,0,2,2]
-; AVX512F-NEXT: vmulps %xmm2, %xmm11, %xmm11
-; AVX512F-NEXT: vaddps %xmm11, %xmm8, %xmm8
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm9[1,1,3,3]
-; AVX512F-NEXT: vmulps %xmm3, %xmm11, %xmm12
+; AVX512F-NEXT: vmovsldup {{.*#+}} xmm10 = xmm9[0,0,2,2]
+; AVX512F-NEXT: vmulps %xmm2, %xmm10, %xmm10
+; AVX512F-NEXT: vaddps %xmm10, %xmm8, %xmm8
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm9[1,1,3,3]
+; AVX512F-NEXT: vmulps %xmm3, %xmm10, %xmm12
; AVX512F-NEXT: vaddps %xmm12, %xmm8, %xmm8
; AVX512F-NEXT: vmulss %xmm7, %xmm4, %xmm7
-; AVX512F-NEXT: vmulss %xmm5, %xmm9, %xmm12
+; AVX512F-NEXT: vmulss %xmm9, %xmm11, %xmm12
; AVX512F-NEXT: vaddss %xmm7, %xmm12, %xmm7
-; AVX512F-NEXT: vmulss %xmm11, %xmm10, %xmm11
-; AVX512F-NEXT: vaddss %xmm7, %xmm11, %xmm7
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3]
-; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3]
-; AVX512F-NEXT: vshufpd {{.*#+}} xmm11 = xmm9[1,0]
+; AVX512F-NEXT: vmulss %xmm6, %xmm10, %xmm10
+; AVX512F-NEXT: vaddss %xmm7, %xmm10, %xmm7
+; AVX512F-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3]
+; AVX512F-NEXT: vshufpd {{.*#+}} xmm12 = xmm9[1,0]
; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2]
; AVX512F-NEXT: vmulps %xmm0, %xmm9, %xmm0
-; AVX512F-NEXT: vmulps %xmm2, %xmm8, %xmm2
+; AVX512F-NEXT: vmulps %xmm2, %xmm10, %xmm2
; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm1
; AVX512F-NEXT: vbroadcastss %xmm1, %xmm2
; AVX512F-NEXT: vmulps %xmm2, %xmm3, %xmm2
; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vmulss %xmm4, %xmm11, %xmm2
-; AVX512F-NEXT: vmulss %xmm5, %xmm8, %xmm3
+; AVX512F-NEXT: vmulss %xmm4, %xmm12, %xmm2
+; AVX512F-NEXT: vmulss %xmm10, %xmm11, %xmm3
; AVX512F-NEXT: vaddss %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vmulss %xmm1, %xmm10, %xmm1
+; AVX512F-NEXT: vmulss %xmm1, %xmm6, %xmm1
; AVX512F-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm2
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0]
-; AVX512F-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm8[1,1,3,3]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[2,3]
+; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1,2],xmm8[0]
+; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX512F-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_mul3x3_f32:
@@ -447,26 +448,27 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
; AVX512VL-NEXT: vaddss %xmm7, %xmm12, %xmm7
; AVX512VL-NEXT: vmulss %xmm11, %xmm10, %xmm11
; AVX512VL-NEXT: vaddss %xmm7, %xmm11, %xmm7
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3]
-; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = xmm8[1,0]
+; AVX512VL-NEXT: vshufps {{.*#+}} xmm11 = xmm8[3,3,3,3]
+; AVX512VL-NEXT: vshufpd {{.*#+}} xmm12 = xmm8[1,0]
; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2]
; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT: vmulps %xmm7, %xmm2, %xmm2
+; AVX512VL-NEXT: vmulps %xmm2, %xmm11, %xmm2
; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vextractf32x4 $2, %zmm1, %xmm1
; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2
; AVX512VL-NEXT: vmulps %xmm2, %xmm6, %xmm2
; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vmulss %xmm11, %xmm9, %xmm2
-; AVX512VL-NEXT: vmulss %xmm7, %xmm4, %xmm4
+; AVX512VL-NEXT: vmulss %xmm12, %xmm9, %xmm2
+; AVX512VL-NEXT: vmulss %xmm4, %xmm11, %xmm4
; AVX512VL-NEXT: vaddss %xmm4, %xmm2, %xmm2
; AVX512VL-NEXT: vmulss %xmm1, %xmm10, %xmm1
; AVX512VL-NEXT: vaddss %xmm1, %xmm2, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm3, %zmm2
-; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0]
-; AVX512VL-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm5[1,1,3,3]
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[2,3]
+; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm5[0]
+; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512VL-NEXT: retq
entry:
%block = shufflevector <9 x float> %a0, <9 x float> poison, <2 x i32> <i32 0, i32 1>
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 595f849..d8be4cf 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512
;
; 32-bit SSE tests to make sure we do reasonable things.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86-SSE,X86-SSE1
@@ -353,6 +353,39 @@ define <4 x float> @merge_4f32_f32_019u(ptr %ptr) nounwind uwtable noinline ssp
ret <4 x float> %res3
}
+define <4 x float> @merge_v4f32_f32_3210(ptr %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_v4f32_f32_3210:
+; SSE: # %bb.0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_v4f32_f32_3210:
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; AVX-NEXT: retq
+;
+; X86-SSE-LABEL: merge_v4f32_f32_3210:
+; X86-SSE: # %bb.0:
+; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT: movups (%eax), %xmm0
+; X86-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X86-SSE-NEXT: retl
+ %ptr0 = getelementptr inbounds float, ptr %ptr, i64 3
+ %ptr1 = getelementptr inbounds float, ptr %ptr, i64 2
+ %ptr2 = getelementptr inbounds float, ptr %ptr, i64 1
+ %ptr3 = getelementptr inbounds float, ptr %ptr, i64 0
+ %val0 = load float, ptr %ptr0, align 4
+ %val1 = load float, ptr %ptr1, align 4
+ %val2 = load float, ptr %ptr2, align 4
+ %val3 = load float, ptr %ptr3, align 4
+ %res0 = insertelement <4 x float> poison, float %val0, i64 0
+ %res1 = insertelement <4 x float> %res0, float %val1, i64 1
+ %res2 = insertelement <4 x float> %res1, float %val2, i64 2
+ %res3 = insertelement <4 x float> %res2, float %val3, i64 3
+ ret <4 x float> %res3
+}
+
define <4 x i32> @merge_4i32_i32_23u5(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_4i32_i32_23u5:
; SSE: # %bb.0:
@@ -724,6 +757,63 @@ define <4 x i32> @merge_4i32_i32_45zz_inc5(ptr %ptr) nounwind uwtable noinline s
ret <4 x i32> %res1
}
+define <4 x i32> @merge_v4i32_i32_3210(ptr %ptr) nounwind uwtable noinline ssp {
+; SSE-LABEL: merge_v4i32_i32_3210:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqu (%rdi), %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: merge_v4i32_i32_3210:
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
+; AVX-NEXT: retq
+;
+; X86-SSE1-LABEL: merge_v4i32_i32_3210:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %edi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT: pushl %esi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X86-SSE1-NEXT: .cfi_offset %esi, -12
+; X86-SSE1-NEXT: .cfi_offset %edi, -8
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE1-NEXT: movl 12(%ecx), %edx
+; X86-SSE1-NEXT: movl 8(%ecx), %esi
+; X86-SSE1-NEXT: movl (%ecx), %edi
+; X86-SSE1-NEXT: movl 4(%ecx), %ecx
+; X86-SSE1-NEXT: movl %edi, 12(%eax)
+; X86-SSE1-NEXT: movl %ecx, 8(%eax)
+; X86-SSE1-NEXT: movl %esi, 4(%eax)
+; X86-SSE1-NEXT: movl %edx, (%eax)
+; X86-SSE1-NEXT: popl %esi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT: popl %edi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT: retl $4
+;
+; X86-SSE41-LABEL: merge_v4i32_i32_3210:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movdqu (%eax), %xmm0
+; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; X86-SSE41-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 3
+ %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 2
+ %ptr2 = getelementptr inbounds i32, ptr %ptr, i64 1
+ %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 0
+ %val0 = load i32, ptr %ptr0, align 4
+ %val1 = load i32, ptr %ptr1, align 4
+ %val2 = load i32, ptr %ptr2, align 4
+ %val3 = load i32, ptr %ptr3, align 4
+ %res0 = insertelement <4 x i32> poison, i32 %val0, i64 0
+ %res1 = insertelement <4 x i32> %res0, i32 %val1, i64 1
+ %res2 = insertelement <4 x i32> %res1, i32 %val2, i64 2
+ %res3 = insertelement <4 x i32> %res2, i32 %val3, i64 3
+ ret <4 x i32> %res3
+}
+
define <8 x i16> @merge_8i16_i16_23u567u9(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_8i16_i16_23u567u9:
; SSE: # %bb.0:
@@ -862,6 +952,110 @@ define <8 x i16> @merge_8i16_i16_45u7zzzz(ptr %ptr) nounwind uwtable noinline ss
ret <8 x i16> %res7
}
+define <8 x i16> @merge_8i16_i16_76543210(ptr %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_8i16_i16_76543210:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: merge_8i16_i16_76543210:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqu (%rdi), %xmm0
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: merge_8i16_i16_76543210:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; AVX-NEXT: retq
+;
+; X86-SSE1-LABEL: merge_8i16_i16_76543210:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebp
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT: pushl %ebx
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X86-SSE1-NEXT: pushl %edi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 16
+; X86-SSE1-NEXT: pushl %esi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 20
+; X86-SSE1-NEXT: pushl %eax
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 24
+; X86-SSE1-NEXT: .cfi_offset %esi, -20
+; X86-SSE1-NEXT: .cfi_offset %edi, -16
+; X86-SSE1-NEXT: .cfi_offset %ebx, -12
+; X86-SSE1-NEXT: .cfi_offset %ebp, -8
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movzwl 14(%eax), %ecx
+; X86-SSE1-NEXT: movw %cx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill
+; X86-SSE1-NEXT: movzwl 12(%eax), %ecx
+; X86-SSE1-NEXT: movw %cx, (%esp) # 2-byte Spill
+; X86-SSE1-NEXT: movzwl 10(%eax), %esi
+; X86-SSE1-NEXT: movzwl 8(%eax), %edi
+; X86-SSE1-NEXT: movzwl 6(%eax), %ebx
+; X86-SSE1-NEXT: movzwl 4(%eax), %ebp
+; X86-SSE1-NEXT: movzwl (%eax), %ecx
+; X86-SSE1-NEXT: movzwl 2(%eax), %edx
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movw %cx, 14(%eax)
+; X86-SSE1-NEXT: movw %dx, 12(%eax)
+; X86-SSE1-NEXT: movw %bp, 10(%eax)
+; X86-SSE1-NEXT: movw %bx, 8(%eax)
+; X86-SSE1-NEXT: movw %di, 6(%eax)
+; X86-SSE1-NEXT: movw %si, 4(%eax)
+; X86-SSE1-NEXT: movzwl (%esp), %ecx # 2-byte Folded Reload
+; X86-SSE1-NEXT: movw %cx, 2(%eax)
+; X86-SSE1-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 2-byte Folded Reload
+; X86-SSE1-NEXT: movw %cx, (%eax)
+; X86-SSE1-NEXT: addl $4, %esp
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 20
+; X86-SSE1-NEXT: popl %esi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 16
+; X86-SSE1-NEXT: popl %edi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X86-SSE1-NEXT: popl %ebx
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT: popl %ebp
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT: retl $4
+;
+; X86-SSE41-LABEL: merge_8i16_i16_76543210:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movdqu (%eax), %xmm0
+; X86-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; X86-SSE41-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 7
+ %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 6
+ %ptr2 = getelementptr inbounds i16, ptr %ptr, i64 5
+ %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 4
+ %ptr4 = getelementptr inbounds i16, ptr %ptr, i64 3
+ %ptr5 = getelementptr inbounds i16, ptr %ptr, i64 2
+ %ptr6 = getelementptr inbounds i16, ptr %ptr, i64 1
+ %ptr7 = getelementptr inbounds i16, ptr %ptr, i64 0
+ %val0 = load i16, ptr %ptr0
+ %val1 = load i16, ptr %ptr1
+ %val2 = load i16, ptr %ptr2
+ %val3 = load i16, ptr %ptr3
+ %val4 = load i16, ptr %ptr4
+ %val5 = load i16, ptr %ptr5
+ %val6 = load i16, ptr %ptr6
+ %val7 = load i16, ptr %ptr7
+ %res0 = insertelement <8 x i16> poison, i16 %val0, i64 0
+ %res1 = insertelement <8 x i16> %res0, i16 %val1, i64 1
+ %res2 = insertelement <8 x i16> %res1, i16 %val2, i64 2
+ %res3 = insertelement <8 x i16> %res2, i16 %val3, i64 3
+ %res4 = insertelement <8 x i16> %res3, i16 %val4, i64 4
+ %res5 = insertelement <8 x i16> %res4, i16 %val5, i64 5
+ %res6 = insertelement <8 x i16> %res5, i16 %val6, i64 6
+ %res7 = insertelement <8 x i16> %res6, i16 %val7, i64 7
+ ret <8 x i16> %res7
+}
+
define <16 x i8> @merge_16i8_i8_01u3456789ABCDuF(ptr %ptr) nounwind uwtable noinline ssp {
; SSE-LABEL: merge_16i8_i8_01u3456789ABCDuF:
; SSE: # %bb.0:
@@ -1056,6 +1250,164 @@ define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(ptr %ptr) nounwind uwtable noin
ret <16 x i8> %resF
}
+define <16 x i8> @merge_16i8_i8_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
+; SSE2-LABEL: merge_16i8_i8_FEDCBA9876543210:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqu (%rdi), %xmm0
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: merge_16i8_i8_FEDCBA9876543210:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; AVX-NEXT: retq
+;
+; X86-SSE1-LABEL: merge_16i8_i8_FEDCBA9876543210:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebx
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT: pushl %esi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X86-SSE1-NEXT: subl $12, %esp
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 24
+; X86-SSE1-NEXT: .cfi_offset %esi, -12
+; X86-SSE1-NEXT: .cfi_offset %ebx, -8
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-SSE1-NEXT: movzbl 15(%esi), %ecx
+; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT: movzbl 14(%esi), %ecx
+; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT: movzbl 13(%esi), %ecx
+; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT: movzbl 12(%esi), %ecx
+; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT: movzbl 11(%esi), %ecx
+; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT: movzbl 10(%esi), %ecx
+; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT: movzbl 9(%esi), %ecx
+; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT: movzbl 8(%esi), %ecx
+; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT: movzbl 7(%esi), %ecx
+; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT: movzbl 6(%esi), %ecx
+; X86-SSE1-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SSE1-NEXT: movb 5(%esi), %bh
+; X86-SSE1-NEXT: movb 4(%esi), %bl
+; X86-SSE1-NEXT: movb 3(%esi), %dh
+; X86-SSE1-NEXT: movb 2(%esi), %ch
+; X86-SSE1-NEXT: movb (%esi), %cl
+; X86-SSE1-NEXT: movb 1(%esi), %dl
+; X86-SSE1-NEXT: movb %cl, 15(%eax)
+; X86-SSE1-NEXT: movb %dl, 14(%eax)
+; X86-SSE1-NEXT: movb %ch, 13(%eax)
+; X86-SSE1-NEXT: movb %dh, 12(%eax)
+; X86-SSE1-NEXT: movb %bl, 11(%eax)
+; X86-SSE1-NEXT: movb %bh, 10(%eax)
+; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT: movb %cl, 9(%eax)
+; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT: movb %cl, 8(%eax)
+; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT: movb %cl, 7(%eax)
+; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT: movb %cl, 6(%eax)
+; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT: movb %cl, 5(%eax)
+; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT: movb %cl, 4(%eax)
+; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT: movb %cl, 3(%eax)
+; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT: movb %cl, 2(%eax)
+; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT: movb %cl, 1(%eax)
+; X86-SSE1-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SSE1-NEXT: movb %cl, (%eax)
+; X86-SSE1-NEXT: addl $12, %esp
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X86-SSE1-NEXT: popl %esi
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE1-NEXT: popl %ebx
+; X86-SSE1-NEXT: .cfi_def_cfa_offset 4
+; X86-SSE1-NEXT: retl $4
+;
+; X86-SSE41-LABEL: merge_16i8_i8_FEDCBA9876543210:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movdqu (%eax), %xmm0
+; X86-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X86-SSE41-NEXT: retl
+ %ptr0 = getelementptr inbounds i8, ptr %ptr, i64 15
+ %ptr1 = getelementptr inbounds i8, ptr %ptr, i64 14
+ %ptr2 = getelementptr inbounds i8, ptr %ptr, i64 13
+ %ptr3 = getelementptr inbounds i8, ptr %ptr, i64 12
+ %ptr4 = getelementptr inbounds i8, ptr %ptr, i64 11
+ %ptr5 = getelementptr inbounds i8, ptr %ptr, i64 10
+ %ptr6 = getelementptr inbounds i8, ptr %ptr, i64 9
+ %ptr7 = getelementptr inbounds i8, ptr %ptr, i64 8
+ %ptr8 = getelementptr inbounds i8, ptr %ptr, i64 7
+ %ptr9 = getelementptr inbounds i8, ptr %ptr, i64 6
+ %ptrA = getelementptr inbounds i8, ptr %ptr, i64 5
+ %ptrB = getelementptr inbounds i8, ptr %ptr, i64 4
+ %ptrC = getelementptr inbounds i8, ptr %ptr, i64 3
+ %ptrD = getelementptr inbounds i8, ptr %ptr, i64 2
+ %ptrE = getelementptr inbounds i8, ptr %ptr, i64 1
+ %ptrF = getelementptr inbounds i8, ptr %ptr, i64 0
+ %val0 = load i8, ptr %ptr0
+ %val1 = load i8, ptr %ptr1
+ %val2 = load i8, ptr %ptr2
+ %val3 = load i8, ptr %ptr3
+ %val4 = load i8, ptr %ptr4
+ %val5 = load i8, ptr %ptr5
+ %val6 = load i8, ptr %ptr6
+ %val7 = load i8, ptr %ptr7
+ %val8 = load i8, ptr %ptr8
+ %val9 = load i8, ptr %ptr9
+ %valA = load i8, ptr %ptrA
+ %valB = load i8, ptr %ptrB
+ %valC = load i8, ptr %ptrC
+ %valD = load i8, ptr %ptrD
+ %valE = load i8, ptr %ptrE
+ %valF = load i8, ptr %ptrF
+ %res0 = insertelement <16 x i8> poison, i8 %val0, i64 0
+ %res1 = insertelement <16 x i8> %res0, i8 %val1, i64 1
+ %res2 = insertelement <16 x i8> %res1, i8 %val2, i64 2
+ %res3 = insertelement <16 x i8> %res2, i8 %val3, i64 3
+ %res4 = insertelement <16 x i8> %res3, i8 %val4, i64 4
+ %res5 = insertelement <16 x i8> %res4, i8 %val5, i64 5
+ %res6 = insertelement <16 x i8> %res5, i8 %val6, i64 6
+ %res7 = insertelement <16 x i8> %res6, i8 %val7, i64 7
+ %res8 = insertelement <16 x i8> %res7, i8 %val8, i64 8
+ %res9 = insertelement <16 x i8> %res8, i8 %val9, i64 9
+ %resA = insertelement <16 x i8> %res9, i8 %valA, i64 10
+ %resB = insertelement <16 x i8> %resA, i8 %valB, i64 11
+ %resC = insertelement <16 x i8> %resB, i8 %valC, i64 12
+ %resD = insertelement <16 x i8> %resC, i8 %valD, i64 13
+ %resE = insertelement <16 x i8> %resD, i8 %valE, i64 14
+ %resF = insertelement <16 x i8> %resE, i8 %valF, i64 15
+ ret <16 x i8> %resF
+}
+
define void @merge_4i32_i32_combine(ptr %dst, ptr %src) {
; SSE-LABEL: merge_4i32_i32_combine:
; SSE: # %bb.0:
@@ -1285,3 +1637,90 @@ define <4 x i32> @load_i32_zext_i128_v4i32(ptr %ptr) {
%3 = bitcast i128 %2 to <4 x i32>
ret <4 x i32> %3
}
+
+; Don't attempt to reverse a partial VZEXT_LOAD
+define <4 x i32> @no_reverse_vzload(ptr %p0) nounwind {
+; SSE2-LABEL: no_reverse_vzload:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE2-NEXT: pxor %xmm2, %xmm2
+; SSE2-NEXT: paddd %xmm1, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: no_reverse_vzload:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE41-NEXT: pxor %xmm2, %xmm2
+; SSE41-NEXT: paddd %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: no_reverse_vzload:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: no_reverse_vzload:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: no_reverse_vzload:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: retq
+;
+; X86-SSE1-LABEL: no_reverse_vzload:
+; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %ebx
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-SSE1-NEXT: xorl %ecx, %ecx
+; X86-SSE1-NEXT: cmpl $0, (%edx)
+; X86-SSE1-NEXT: setg %cl
+; X86-SSE1-NEXT: negl %ecx
+; X86-SSE1-NEXT: xorl %ebx, %ebx
+; X86-SSE1-NEXT: cmpl $0, 4(%edx)
+; X86-SSE1-NEXT: setg %bl
+; X86-SSE1-NEXT: negl %ebx
+; X86-SSE1-NEXT: movl %ebx, 4(%eax)
+; X86-SSE1-NEXT: movl %ecx, (%eax)
+; X86-SSE1-NEXT: movl $0, 12(%eax)
+; X86-SSE1-NEXT: movl $0, 8(%eax)
+; X86-SSE1-NEXT: popl %ebx
+; X86-SSE1-NEXT: retl $4
+;
+; X86-SSE41-LABEL: no_reverse_vzload:
+; X86-SSE41: # %bb.0:
+; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE41-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0]
+; X86-SSE41-NEXT: pxor %xmm2, %xmm2
+; X86-SSE41-NEXT: paddd %xmm1, %xmm1
+; X86-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm0
+; X86-SSE41-NEXT: retl
+ %i0 = load <2 x i32>, ptr %p0, align 4
+ %i1 = shufflevector <2 x i32> %i0, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ %i2 = shufflevector <4 x i32> zeroinitializer, <4 x i32> %i1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %i3 = shl <4 x i32> %i2, <i32 4, i32 4, i32 1, i32 1>
+ %i4 = shufflevector <4 x i32> %i1, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ %i5 = icmp slt <4 x i32> %i3, %i4
+ %i6 = sext <4 x i1> %i5 to <4 x i32>
+ ret <4 x i32> %i6
+}
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
index 33e8d62..6ad306d 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -126,6 +126,44 @@ define <4 x double> @merge_4f64_f64_45zz(ptr %ptr) nounwind uwtable noinline ssp
ret <4 x double> %res1
}
+define <4 x double> @merge_v4f64_f64_3210(ptr %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_v4f64_f64_3210:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_v4f64_f64_3210:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_v4f64_f64_3210:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX512F-NEXT: retq
+;
+; X86-AVX-LABEL: merge_v4f64_f64_3210:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; X86-AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; X86-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds double, ptr %ptr, i64 3
+ %ptr1 = getelementptr inbounds double, ptr %ptr, i64 2
+ %ptr2 = getelementptr inbounds double, ptr %ptr, i64 1
+ %ptr3 = getelementptr inbounds double, ptr %ptr, i64 0
+ %val0 = load double, ptr %ptr0, align 4
+ %val1 = load double, ptr %ptr1, align 4
+ %val2 = load double, ptr %ptr2, align 4
+ %val3 = load double, ptr %ptr3, align 4
+ %res0 = insertelement <4 x double> poison, double %val0, i64 0
+ %res1 = insertelement <4 x double> %res0, double %val1, i64 1
+ %res2 = insertelement <4 x double> %res1, double %val2, i64 2
+ %res3 = insertelement <4 x double> %res2, double %val3, i64 3
+ ret <4 x double> %res3
+}
+
define <4 x double> @merge_4f64_f64_34z6(ptr %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_4f64_f64_34z6:
; AVX: # %bb.0:
@@ -234,6 +272,51 @@ define <4 x i64> @merge_4i64_i64_23zz(ptr %ptr) nounwind uwtable noinline ssp {
ret <4 x i64> %res1
}
+define <4 x i64> @merge_v4i64_i64_3210(ptr %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_v4i64_i64_3210:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_v4i64_i64_3210:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_v4i64_i64_3210:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,1,0]
+; AVX512F-NEXT: retq
+;
+; X86-AVX-LABEL: merge_v4i64_i64_3210:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpinsrd $1, 12(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
+; X86-AVX-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
+; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX-NEXT: vpinsrd $1, 28(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT: vpinsrd $2, 16(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT: vpinsrd $3, 20(%eax), %xmm1, %xmm1
+; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 3
+ %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 2
+ %ptr2 = getelementptr inbounds i64, ptr %ptr, i64 1
+ %ptr3 = getelementptr inbounds i64, ptr %ptr, i64 0
+ %val0 = load i64, ptr %ptr0, align 4
+ %val1 = load i64, ptr %ptr1, align 4
+ %val2 = load i64, ptr %ptr2, align 4
+ %val3 = load i64, ptr %ptr3, align 4
+ %res0 = insertelement <4 x i64> poison, i64 %val0, i64 0
+ %res1 = insertelement <4 x i64> %res0, i64 %val1, i64 1
+ %res2 = insertelement <4 x i64> %res1, i64 %val2, i64 2
+ %res3 = insertelement <4 x i64> %res2, i64 %val3, i64 3
+ ret <4 x i64> %res3
+}
+
define <8 x float> @merge_8f32_2f32_23z5(ptr %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8f32_2f32_23z5:
; AVX: # %bb.0:
@@ -335,6 +418,58 @@ define <8 x float> @merge_8f32_f32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline
ret <8 x float> %res7
}
+define <8 x float> @merge_8f32_f32_76543210(ptr %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_8f32_f32_76543210:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_8f32_f32_76543210:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_8f32_f32_76543210:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; X86-AVX-LABEL: merge_8f32_f32_76543210:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; X86-AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; X86-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds float, ptr %ptr, i64 7
+ %ptr1 = getelementptr inbounds float, ptr %ptr, i64 6
+ %ptr2 = getelementptr inbounds float, ptr %ptr, i64 5
+ %ptr3 = getelementptr inbounds float, ptr %ptr, i64 4
+ %ptr4 = getelementptr inbounds float, ptr %ptr, i64 3
+ %ptr5 = getelementptr inbounds float, ptr %ptr, i64 2
+ %ptr6 = getelementptr inbounds float, ptr %ptr, i64 1
+ %ptr7 = getelementptr inbounds float, ptr %ptr, i64 0
+ %val0 = load float, ptr %ptr0
+ %val1 = load float, ptr %ptr1
+ %val2 = load float, ptr %ptr2
+ %val3 = load float, ptr %ptr3
+ %val4 = load float, ptr %ptr4
+ %val5 = load float, ptr %ptr5
+ %val6 = load float, ptr %ptr6
+ %val7 = load float, ptr %ptr7
+ %res0 = insertelement <8 x float> poison, float %val0, i64 0
+ %res1 = insertelement <8 x float> %res0, float %val1, i64 1
+ %res2 = insertelement <8 x float> %res1, float %val2, i64 2
+ %res3 = insertelement <8 x float> %res2, float %val3, i64 3
+ %res4 = insertelement <8 x float> %res3, float %val4, i64 4
+ %res5 = insertelement <8 x float> %res4, float %val5, i64 5
+ %res6 = insertelement <8 x float> %res5, float %val6, i64 6
+ %res7 = insertelement <8 x float> %res6, float %val7, i64 7
+ ret <8 x float> %res7
+}
+
define <8 x i32> @merge_8i32_4i32_z3(ptr %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_8i32_4i32_z3:
; AVX: # %bb.0:
@@ -414,6 +549,58 @@ define <8 x i32> @merge_8i32_i32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ss
ret <8 x i32> %res7
}
+define <8 x i32> @merge_8i32_i32_76543210(ptr %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_8i32_i32_76543210:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_8i32_i32_76543210:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_8i32_i32_76543210:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; X86-AVX-LABEL: merge_8i32_i32_76543210:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
+; X86-AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; X86-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 7
+ %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 6
+ %ptr2 = getelementptr inbounds i32, ptr %ptr, i64 5
+ %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 4
+ %ptr4 = getelementptr inbounds i32, ptr %ptr, i64 3
+ %ptr5 = getelementptr inbounds i32, ptr %ptr, i64 2
+ %ptr6 = getelementptr inbounds i32, ptr %ptr, i64 1
+ %ptr7 = getelementptr inbounds i32, ptr %ptr, i64 0
+ %val0 = load i32, ptr %ptr0
+ %val1 = load i32, ptr %ptr1
+ %val2 = load i32, ptr %ptr2
+ %val3 = load i32, ptr %ptr3
+ %val4 = load i32, ptr %ptr4
+ %val5 = load i32, ptr %ptr5
+ %val6 = load i32, ptr %ptr6
+ %val7 = load i32, ptr %ptr7
+ %res0 = insertelement <8 x i32> poison, i32 %val0, i64 0
+ %res1 = insertelement <8 x i32> %res0, i32 %val1, i64 1
+ %res2 = insertelement <8 x i32> %res1, i32 %val2, i64 2
+ %res3 = insertelement <8 x i32> %res2, i32 %val3, i64 3
+ %res4 = insertelement <8 x i32> %res3, i32 %val4, i64 4
+ %res5 = insertelement <8 x i32> %res4, i32 %val5, i64 5
+ %res6 = insertelement <8 x i32> %res5, i32 %val6, i64 6
+ %res7 = insertelement <8 x i32> %res6, i32 %val7, i64 7
+ ret <8 x i32> %res7
+}
+
define <16 x i16> @merge_16i16_i16_89zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_16i16_i16_89zzzuuuuuuuuuuuz:
; AVX: # %bb.0:
@@ -522,6 +709,92 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable n
ret <16 x i16> %resF
}
+define <16 x i16> @merge_16i16_i16_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
+; AVX1-LABEL: merge_16i16_i16_FEDCBA9876543210:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqu (%rdi), %xmm0
+; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: merge_16i16_i16_FEDCBA9876543210:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: merge_16i16_i16_FEDCBA9876543210:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25,22,23,20,21,18,19,16,17]
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: retq
+;
+; X86-AVX-LABEL: merge_16i16_i16_FEDCBA9876543210:
+; X86-AVX: # %bb.0:
+; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT: vmovdqu (%eax), %xmm0
+; X86-AVX-NEXT: vmovdqu 16(%eax), %xmm1
+; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1]
+; X86-AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; X86-AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX-NEXT: retl
+ %ptr0 = getelementptr inbounds i16, ptr %ptr, i64 15
+ %ptr1 = getelementptr inbounds i16, ptr %ptr, i64 14
+ %ptr2 = getelementptr inbounds i16, ptr %ptr, i64 13
+ %ptr3 = getelementptr inbounds i16, ptr %ptr, i64 12
+ %ptr4 = getelementptr inbounds i16, ptr %ptr, i64 11
+ %ptr5 = getelementptr inbounds i16, ptr %ptr, i64 10
+ %ptr6 = getelementptr inbounds i16, ptr %ptr, i64 9
+ %ptr7 = getelementptr inbounds i16, ptr %ptr, i64 8
+ %ptr8 = getelementptr inbounds i16, ptr %ptr, i64 7
+ %ptr9 = getelementptr inbounds i16, ptr %ptr, i64 6
+ %ptrA = getelementptr inbounds i16, ptr %ptr, i64 5
+ %ptrB = getelementptr inbounds i16, ptr %ptr, i64 4
+ %ptrC = getelementptr inbounds i16, ptr %ptr, i64 3
+ %ptrD = getelementptr inbounds i16, ptr %ptr, i64 2
+ %ptrE = getelementptr inbounds i16, ptr %ptr, i64 1
+ %ptrF = getelementptr inbounds i16, ptr %ptr, i64 0
+ %val0 = load i16, ptr %ptr0
+ %val1 = load i16, ptr %ptr1
+ %val2 = load i16, ptr %ptr2
+ %val3 = load i16, ptr %ptr3
+ %val4 = load i16, ptr %ptr4
+ %val5 = load i16, ptr %ptr5
+ %val6 = load i16, ptr %ptr6
+ %val7 = load i16, ptr %ptr7
+ %val8 = load i16, ptr %ptr8
+ %val9 = load i16, ptr %ptr9
+ %valA = load i16, ptr %ptrA
+ %valB = load i16, ptr %ptrB
+ %valC = load i16, ptr %ptrC
+ %valD = load i16, ptr %ptrD
+ %valE = load i16, ptr %ptrE
+ %valF = load i16, ptr %ptrF
+ %res0 = insertelement <16 x i16> poison, i16 %val0, i64 0
+ %res1 = insertelement <16 x i16> %res0, i16 %val1, i64 1
+ %res2 = insertelement <16 x i16> %res1, i16 %val2, i64 2
+ %res3 = insertelement <16 x i16> %res2, i16 %val3, i64 3
+ %res4 = insertelement <16 x i16> %res3, i16 %val4, i64 4
+ %res5 = insertelement <16 x i16> %res4, i16 %val5, i64 5
+ %res6 = insertelement <16 x i16> %res5, i16 %val6, i64 6
+ %res7 = insertelement <16 x i16> %res6, i16 %val7, i64 7
+ %res8 = insertelement <16 x i16> %res7, i16 %val8, i64 8
+ %res9 = insertelement <16 x i16> %res8, i16 %val9, i64 9
+ %resA = insertelement <16 x i16> %res9, i16 %valA, i64 10
+ %resB = insertelement <16 x i16> %resA, i16 %valB, i64 11
+ %resC = insertelement <16 x i16> %resB, i16 %valC, i64 12
+ %resD = insertelement <16 x i16> %resC, i16 %valD, i64 13
+ %resE = insertelement <16 x i16> %resD, i16 %valE, i64 14
+ %resF = insertelement <16 x i16> %resE, i16 %valF, i64 15
+ ret <16 x i16> %resF
+}
+
define <32 x i8> @merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(ptr %ptr) nounwind uwtable noinline ssp {
; AVX-LABEL: merge_32i8_i8_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
; AVX: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
index 790bed4..f9a0bd7 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -148,6 +148,46 @@ define <8 x double> @merge_8f64_f64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline
ret <8 x double> %res7
}
+define <8 x double> @merge_8f64_f64_76543210(ptr %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8f64_f64_76543210:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; ALL-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X86-AVX512F-LABEL: merge_8f64_f64_76543210:
+; X86-AVX512F: # %bb.0:
+; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; X86-AVX512F-NEXT: vpermpd (%eax), %zmm0, %zmm0
+; X86-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds double, ptr %ptr, i64 7
+ %ptr1 = getelementptr inbounds double, ptr %ptr, i64 6
+ %ptr2 = getelementptr inbounds double, ptr %ptr, i64 5
+ %ptr3 = getelementptr inbounds double, ptr %ptr, i64 4
+ %ptr4 = getelementptr inbounds double, ptr %ptr, i64 3
+ %ptr5 = getelementptr inbounds double, ptr %ptr, i64 2
+ %ptr6 = getelementptr inbounds double, ptr %ptr, i64 1
+ %ptr7 = getelementptr inbounds double, ptr %ptr, i64 0
+ %val0 = load double, ptr %ptr0
+ %val1 = load double, ptr %ptr1
+ %val2 = load double, ptr %ptr2
+ %val3 = load double, ptr %ptr3
+ %val4 = load double, ptr %ptr4
+ %val5 = load double, ptr %ptr5
+ %val6 = load double, ptr %ptr6
+ %val7 = load double, ptr %ptr7
+ %res0 = insertelement <8 x double> poison, double %val0, i64 0
+ %res1 = insertelement <8 x double> %res0, double %val1, i64 1
+ %res2 = insertelement <8 x double> %res1, double %val2, i64 2
+ %res3 = insertelement <8 x double> %res2, double %val3, i64 3
+ %res4 = insertelement <8 x double> %res3, double %val4, i64 4
+ %res5 = insertelement <8 x double> %res4, double %val5, i64 5
+ %res6 = insertelement <8 x double> %res5, double %val6, i64 6
+ %res7 = insertelement <8 x double> %res6, double %val7, i64 7
+ ret <8 x double> %res7
+}
+
define <8 x i64> @merge_8i64_4i64_z3(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_4i64_z3:
; ALL: # %bb.0:
@@ -227,6 +267,63 @@ define <8 x i64> @merge_8i64_i64_1u3u5zu8(ptr %ptr) nounwind uwtable noinline ss
ret <8 x i64> %res7
}
+define <8 x i64> @merge_8i64_i64_76543210(ptr %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_8i64_i64_76543210:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovsxbq {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; ALL-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; ALL-NEXT: retq
+;
+; X86-AVX512F-LABEL: merge_8i64_i64_76543210:
+; X86-AVX512F: # %bb.0:
+; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT: vpinsrd $1, 12(%eax), %xmm0, %xmm0
+; X86-AVX512F-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
+; X86-AVX512F-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
+; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT: vpinsrd $1, 28(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT: vpinsrd $2, 16(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT: vpinsrd $3, 20(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT: vpinsrd $1, 44(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT: vpinsrd $2, 32(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT: vpinsrd $3, 36(%eax), %xmm1, %xmm1
+; X86-AVX512F-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-AVX512F-NEXT: vpinsrd $1, 60(%eax), %xmm2, %xmm2
+; X86-AVX512F-NEXT: vpinsrd $2, 48(%eax), %xmm2, %xmm2
+; X86-AVX512F-NEXT: vpinsrd $3, 52(%eax), %xmm2, %xmm2
+; X86-AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; X86-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i64, ptr %ptr, i64 7
+ %ptr1 = getelementptr inbounds i64, ptr %ptr, i64 6
+ %ptr2 = getelementptr inbounds i64, ptr %ptr, i64 5
+ %ptr3 = getelementptr inbounds i64, ptr %ptr, i64 4
+ %ptr4 = getelementptr inbounds i64, ptr %ptr, i64 3
+ %ptr5 = getelementptr inbounds i64, ptr %ptr, i64 2
+ %ptr6 = getelementptr inbounds i64, ptr %ptr, i64 1
+ %ptr7 = getelementptr inbounds i64, ptr %ptr, i64 0
+ %val0 = load i64, ptr %ptr0
+ %val1 = load i64, ptr %ptr1
+ %val2 = load i64, ptr %ptr2
+ %val3 = load i64, ptr %ptr3
+ %val4 = load i64, ptr %ptr4
+ %val5 = load i64, ptr %ptr5
+ %val6 = load i64, ptr %ptr6
+ %val7 = load i64, ptr %ptr7
+ %res0 = insertelement <8 x i64> poison, i64 %val0, i64 0
+ %res1 = insertelement <8 x i64> %res0, i64 %val1, i64 1
+ %res2 = insertelement <8 x i64> %res1, i64 %val2, i64 2
+ %res3 = insertelement <8 x i64> %res2, i64 %val3, i64 3
+ %res4 = insertelement <8 x i64> %res3, i64 %val4, i64 4
+ %res5 = insertelement <8 x i64> %res4, i64 %val5, i64 5
+ %res6 = insertelement <8 x i64> %res5, i64 %val6, i64 6
+ %res7 = insertelement <8 x i64> %res6, i64 %val7, i64 7
+ ret <8 x i64> %res7
+}
+
define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
; ALL: # %bb.0:
@@ -335,6 +432,70 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable
ret <16 x float> %resF
}
+define <16 x float> @merge_16f32_f32_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16f32_f32_FEDCBA9876543210:
+; ALL: # %bb.0:
+; ALL-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
+; ALL-NEXT: retq
+;
+; X86-AVX512F-LABEL: merge_16f32_f32_FEDCBA9876543210:
+; X86-AVX512F: # %bb.0:
+; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512F-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; X86-AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
+; X86-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds float, ptr %ptr, i64 15
+ %ptr1 = getelementptr inbounds float, ptr %ptr, i64 14
+ %ptr2 = getelementptr inbounds float, ptr %ptr, i64 13
+ %ptr3 = getelementptr inbounds float, ptr %ptr, i64 12
+ %ptr4 = getelementptr inbounds float, ptr %ptr, i64 11
+ %ptr5 = getelementptr inbounds float, ptr %ptr, i64 10
+ %ptr6 = getelementptr inbounds float, ptr %ptr, i64 9
+ %ptr7 = getelementptr inbounds float, ptr %ptr, i64 8
+ %ptr8 = getelementptr inbounds float, ptr %ptr, i64 7
+ %ptr9 = getelementptr inbounds float, ptr %ptr, i64 6
+ %ptrA = getelementptr inbounds float, ptr %ptr, i64 5
+ %ptrB = getelementptr inbounds float, ptr %ptr, i64 4
+ %ptrC = getelementptr inbounds float, ptr %ptr, i64 3
+ %ptrD = getelementptr inbounds float, ptr %ptr, i64 2
+ %ptrE = getelementptr inbounds float, ptr %ptr, i64 1
+ %ptrF = getelementptr inbounds float, ptr %ptr, i64 0
+ %val0 = load float, ptr %ptr0
+ %val1 = load float, ptr %ptr1
+ %val2 = load float, ptr %ptr2
+ %val3 = load float, ptr %ptr3
+ %val4 = load float, ptr %ptr4
+ %val5 = load float, ptr %ptr5
+ %val6 = load float, ptr %ptr6
+ %val7 = load float, ptr %ptr7
+ %val8 = load float, ptr %ptr8
+ %val9 = load float, ptr %ptr9
+ %valA = load float, ptr %ptrA
+ %valB = load float, ptr %ptrB
+ %valC = load float, ptr %ptrC
+ %valD = load float, ptr %ptrD
+ %valE = load float, ptr %ptrE
+ %valF = load float, ptr %ptrF
+ %res0 = insertelement <16 x float> poison, float %val0, i64 0
+ %res1 = insertelement <16 x float> %res0, float %val1, i64 1
+ %res2 = insertelement <16 x float> %res1, float %val2, i64 2
+ %res3 = insertelement <16 x float> %res2, float %val3, i64 3
+ %res4 = insertelement <16 x float> %res3, float %val4, i64 4
+ %res5 = insertelement <16 x float> %res4, float %val5, i64 5
+ %res6 = insertelement <16 x float> %res5, float %val6, i64 6
+ %res7 = insertelement <16 x float> %res6, float %val7, i64 7
+ %res8 = insertelement <16 x float> %res7, float %val8, i64 8
+ %res9 = insertelement <16 x float> %res8, float %val9, i64 9
+ %resA = insertelement <16 x float> %res9, float %valA, i64 10
+ %resB = insertelement <16 x float> %resA, float %valB, i64 11
+ %resC = insertelement <16 x float> %resB, float %valC, i64 12
+ %resD = insertelement <16 x float> %resC, float %valD, i64 13
+ %resE = insertelement <16 x float> %resD, float %valE, i64 14
+ %resF = insertelement <16 x float> %resE, float %valF, i64 15
+ ret <16 x float> %resF
+}
+
define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
; ALL: # %bb.0:
@@ -443,6 +604,70 @@ define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(ptr %ptr) nounwind uwtable n
ret <16 x i32> %resF
}
+define <16 x i32> @merge_16i32_i32_FEDCBA9876543210(ptr %ptr) nounwind uwtable noinline ssp {
+; ALL-LABEL: merge_16i32_i32_FEDCBA9876543210:
+; ALL: # %bb.0:
+; ALL-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; ALL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
+; ALL-NEXT: retq
+;
+; X86-AVX512F-LABEL: merge_16i32_i32_FEDCBA9876543210:
+; X86-AVX512F: # %bb.0:
+; X86-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512F-NEXT: vpshufd {{.*#+}} zmm0 = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; X86-AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,2,3,0,1]
+; X86-AVX512F-NEXT: retl
+ %ptr0 = getelementptr inbounds i32, ptr %ptr, i64 15
+ %ptr1 = getelementptr inbounds i32, ptr %ptr, i64 14
+ %ptr2 = getelementptr inbounds i32, ptr %ptr, i64 13
+ %ptr3 = getelementptr inbounds i32, ptr %ptr, i64 12
+ %ptr4 = getelementptr inbounds i32, ptr %ptr, i64 11
+ %ptr5 = getelementptr inbounds i32, ptr %ptr, i64 10
+ %ptr6 = getelementptr inbounds i32, ptr %ptr, i64 9
+ %ptr7 = getelementptr inbounds i32, ptr %ptr, i64 8
+ %ptr8 = getelementptr inbounds i32, ptr %ptr, i64 7
+ %ptr9 = getelementptr inbounds i32, ptr %ptr, i64 6
+ %ptrA = getelementptr inbounds i32, ptr %ptr, i64 5
+ %ptrB = getelementptr inbounds i32, ptr %ptr, i64 4
+ %ptrC = getelementptr inbounds i32, ptr %ptr, i64 3
+ %ptrD = getelementptr inbounds i32, ptr %ptr, i64 2
+ %ptrE = getelementptr inbounds i32, ptr %ptr, i64 1
+ %ptrF = getelementptr inbounds i32, ptr %ptr, i64 0
+ %val0 = load i32, ptr %ptr0
+ %val1 = load i32, ptr %ptr1
+ %val2 = load i32, ptr %ptr2
+ %val3 = load i32, ptr %ptr3
+ %val4 = load i32, ptr %ptr4
+ %val5 = load i32, ptr %ptr5
+ %val6 = load i32, ptr %ptr6
+ %val7 = load i32, ptr %ptr7
+ %val8 = load i32, ptr %ptr8
+ %val9 = load i32, ptr %ptr9
+ %valA = load i32, ptr %ptrA
+ %valB = load i32, ptr %ptrB
+ %valC = load i32, ptr %ptrC
+ %valD = load i32, ptr %ptrD
+ %valE = load i32, ptr %ptrE
+ %valF = load i32, ptr %ptrF
+ %res0 = insertelement <16 x i32> poison, i32 %val0, i64 0
+ %res1 = insertelement <16 x i32> %res0, i32 %val1, i64 1
+ %res2 = insertelement <16 x i32> %res1, i32 %val2, i64 2
+ %res3 = insertelement <16 x i32> %res2, i32 %val3, i64 3
+ %res4 = insertelement <16 x i32> %res3, i32 %val4, i64 4
+ %res5 = insertelement <16 x i32> %res4, i32 %val5, i64 5
+ %res6 = insertelement <16 x i32> %res5, i32 %val6, i64 6
+ %res7 = insertelement <16 x i32> %res6, i32 %val7, i64 7
+ %res8 = insertelement <16 x i32> %res7, i32 %val8, i64 8
+ %res9 = insertelement <16 x i32> %res8, i32 %val9, i64 9
+ %resA = insertelement <16 x i32> %res9, i32 %valA, i64 10
+ %resB = insertelement <16 x i32> %resA, i32 %valB, i64 11
+ %resC = insertelement <16 x i32> %resB, i32 %valC, i64 12
+ %resD = insertelement <16 x i32> %resC, i32 %valD, i64 13
+ %resE = insertelement <16 x i32> %resD, i32 %valE, i64 14
+ %resF = insertelement <16 x i32> %resE, i32 %valF, i64 15
+ ret <16 x i32> %resF
+}
+
define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(ptr %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
; ALL: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index a798f4c..541ca9d 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -2368,17 +2368,15 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; SSE41-NEXT: psubb %xmm3, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm3, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm2, %xmm4
+; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm1, %xmm5
-; SSE41-NEXT: paddb %xmm5, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm3
+; SSE41-NEXT: paddb %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i8_signed_reg_reg:
@@ -2390,14 +2388,13 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -2429,12 +2426,10 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
@@ -2447,12 +2442,10 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -2591,17 +2584,15 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; SSE41-NEXT: psubb %xmm2, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm5
-; SSE41-NEXT: pand %xmm2, %xmm5
-; SSE41-NEXT: pandn %xmm4, %xmm2
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pmullw %xmm4, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: pandn %xmm4, %xmm3
+; SSE41-NEXT: pmaddubsw %xmm3, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm1, %xmm5
-; SSE41-NEXT: paddb %xmm5, %xmm0
+; SSE41-NEXT: por %xmm1, %xmm2
+; SSE41-NEXT: paddb %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: vec128_i8_unsigned_reg_reg:
@@ -2615,14 +2606,13 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -2656,12 +2646,10 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
@@ -2674,12 +2662,10 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -2822,16 +2808,14 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; SSE41-NEXT: psubb %xmm3, %xmm0
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm3, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm2, %xmm4
+; SSE41-NEXT: pmaddubsw %xmm4, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -2845,14 +2829,13 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -2886,12 +2869,10 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
-; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
+; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3
+; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2],xmm3[2],xmm0[4],xmm3[4],xmm0[6],xmm3[6],xmm0[8],xmm3[8],xmm0[10],xmm3[10],xmm0[12],xmm3[12],xmm0[14],xmm3[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; XOP-FALLBACK-NEXT: retq
;
@@ -2905,12 +2886,10 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm0, %xmm0
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm0, %xmm4
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2],xmm4[2],xmm0[4],xmm4[4],xmm0[6],xmm4[6],xmm0[8],xmm4[8],xmm0[10],xmm4[10],xmm0[12],xmm4[12],xmm0[14],xmm4[14]
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3
+; XOPAVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2],xmm3[2],xmm0[4],xmm3[4],xmm0[6],xmm3[6],xmm0[8],xmm3[8],xmm0[10],xmm3[10],xmm0[12],xmm3[12],xmm0[14],xmm3[14]
; XOPAVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -3053,16 +3032,14 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; SSE41-NEXT: psubb %xmm3, %xmm1
; SSE41-NEXT: psrlw $1, %xmm1
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm3, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm2, %xmm4
+; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: por %xmm3, %xmm1
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -3076,14 +3053,13 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -3117,12 +3093,10 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
@@ -3136,12 +3110,10 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
@@ -3286,16 +3258,14 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; SSE41-NEXT: psubb %xmm3, %xmm0
; SSE41-NEXT: psrlw $1, %xmm0
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pand %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm3, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pandn %xmm2, %xmm4
+; SSE41-NEXT: pmaddubsw %xmm4, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: paddb %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -3310,14 +3280,13 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpandn %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
@@ -3353,12 +3322,10 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-FALLBACK-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOP-FALLBACK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOP-FALLBACK-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOP-FALLBACK-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOP-FALLBACK-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOP-FALLBACK-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOP-FALLBACK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOP-FALLBACK-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOP-FALLBACK-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOP-FALLBACK-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOP-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOP-FALLBACK-NEXT: retq
;
@@ -3373,12 +3340,10 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOPAVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
; XOPAVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT: vpshlb %xmm3, %xmm1, %xmm1
-; XOPAVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; XOPAVX1-NEXT: vpandn %xmm2, %xmm3, %xmm4
-; XOPAVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm4
-; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpmaddubsw %xmm2, %xmm1, %xmm1
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2],xmm4[2],xmm1[4],xmm4[4],xmm1[6],xmm4[6],xmm1[8],xmm4[8],xmm1[10],xmm4[10],xmm1[12],xmm4[12],xmm1[14],xmm4[14]
+; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; XOPAVX1-NEXT: vpmaddubsw %xmm3, %xmm1, %xmm3
+; XOPAVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[2],xmm3[2],xmm1[4],xmm3[4],xmm1[6],xmm3[6],xmm1[8],xmm3[8],xmm1[10],xmm3[10],xmm1[12],xmm3[12],xmm1[14],xmm3[14]
; XOPAVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; XOPAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index 7c9adaf..85791cd 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -1896,40 +1896,38 @@ define <16 x i16> @vec256_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin
define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwind {
; AVX1-LABEL: vec256_i8_signed_reg_reg:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5
; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6
; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpminsb %xmm3, %xmm2, %xmm6
-; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6
+; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
-; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4
+; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -1943,14 +1941,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -1974,15 +1971,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
-; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6
-; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -1998,14 +1993,13 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
@@ -2087,19 +2081,17 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm8
-; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpandn %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpandn %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm7, %xmm1
; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
@@ -2119,14 +2111,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX2-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm4
-; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpand %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2150,15 +2141,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
-; XOP-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
+; XOP-NEXT: vpmullw %xmm5, %xmm1, %xmm1
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm1, %xmm1
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm2, %xmm6
-; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
+; XOP-NEXT: vpmullw %xmm4, %xmm2, %xmm2
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0
@@ -2175,14 +2164,13 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT: vpand %ymm2, %ymm4, %ymm4
-; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
@@ -2247,41 +2235,39 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind {
; AVX1-LABEL: vec256_i8_signed_mem_reg:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm5
; AVX1-NEXT: vpminsb %xmm0, %xmm1, %xmm6
; AVX1-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0
-; AVX1-NEXT: vpminsb %xmm3, %xmm2, %xmm6
-; AVX1-NEXT: vpmaxsb %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpsubb %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
+; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6
+; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
+; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
-; AVX1-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm8, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4
+; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -2296,14 +2282,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX2-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4
-; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -2328,15 +2313,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
-; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
-; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
+; XOP-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6
-; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0
@@ -2353,14 +2336,13 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX512F-NEXT: vpsubb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm4
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
@@ -2443,19 +2425,17 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
-; AVX1-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
+; AVX1-NEXT: vpmullw %xmm5, %xmm2, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2
+; AVX1-NEXT: vpor %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4
; AVX1-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3
@@ -2474,14 +2454,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2506,15 +2485,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm2, %xmm8
-; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
-; XOP-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm2
+; XOP-NEXT: vpmullw %xmm5, %xmm2, %xmm2
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm2, %xmm2
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm3, %xmm6
-; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpmaddubsw %xmm4, %xmm3, %xmm3
+; XOP-NEXT: vpmullw %xmm4, %xmm3, %xmm3
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm3, %xmm3
; XOP-NEXT: vpaddb %xmm1, %xmm3, %xmm1
; XOP-NEXT: vpaddb %xmm0, %xmm2, %xmm0
@@ -2531,14 +2508,13 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
@@ -2603,44 +2579,42 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; AVX1-LABEL: vec256_i8_signed_mem_mem:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rsi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rsi), %xmm2
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa (%rsi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
+; AVX1-NEXT: vmovdqa (%rdi), %xmm2
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX1-NEXT: vpcmpgtb %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm6
-; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm5
+; AVX1-NEXT: vpminsb %xmm0, %xmm2, %xmm6
+; AVX1-NEXT: vpmaxsb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsubb %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpminsb %xmm1, %xmm3, %xmm6
+; AVX1-NEXT: vpmaxsb %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsubb %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm6
-; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsubb %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT: vpand %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm8
-; AVX1-NEXT: vpmaddubsw %xmm8, %xmm1, %xmm8
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpandn %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm8, %xmm1
+; AVX1-NEXT: vpmullw %xmm5, %xmm0, %xmm7
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpandn %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
+; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm0
; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm5
-; AVX1-NEXT: vpmaddubsw %xmm5, %xmm2, %xmm5
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpandn %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpmaddubsw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpor %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpaddb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpmullw %xmm4, %xmm1, %xmm5
+; AVX1-NEXT: vpand %xmm5, %xmm8, %xmm5
+; AVX1-NEXT: vpandn %xmm4, %xmm8, %xmm4
+; AVX1-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: vec256_i8_signed_mem_mem:
@@ -2654,14 +2628,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX2-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX2-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX2-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -2687,15 +2660,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; XOP-NEXT: vbroadcastss {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; XOP-NEXT: vpandn %xmm5, %xmm7, %xmm8
; XOP-NEXT: vpmaddubsw %xmm8, %xmm0, %xmm8
-; XOP-NEXT: vpand %xmm7, %xmm5, %xmm5
-; XOP-NEXT: vpmaddubsw %xmm5, %xmm0, %xmm0
+; XOP-NEXT: vpmullw %xmm5, %xmm0, %xmm0
; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30]
; XOP-NEXT: vpperm %xmm5, %xmm8, %xmm0, %xmm0
; XOP-NEXT: vpor %xmm6, %xmm4, %xmm4
; XOP-NEXT: vpandn %xmm4, %xmm7, %xmm6
; XOP-NEXT: vpmaddubsw %xmm6, %xmm1, %xmm6
-; XOP-NEXT: vpand %xmm7, %xmm4, %xmm4
-; XOP-NEXT: vpmaddubsw %xmm4, %xmm1, %xmm1
+; XOP-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; XOP-NEXT: vpperm %xmm5, %xmm6, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm3, %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm2, %xmm0, %xmm0
@@ -2713,14 +2684,13 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm1, %ymm4
-; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm4
-; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512F-NEXT: vpandn %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/midpoint-int.ll b/llvm/test/CodeGen/X86/midpoint-int.ll
index a75d42e..c058e37 100644
--- a/llvm/test/CodeGen/X86/midpoint-int.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int.ll
@@ -658,9 +658,9 @@ define i16 @scalar_i16_signed_reg_reg(i16 %a1, i16 %a2) nounwind {
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: subw %dx, %ax
; X86-NEXT: setle %bl
; X86-NEXT: leal -1(%ebx,%ebx), %edx
@@ -710,9 +710,9 @@ define i16 @scalar_i16_unsigned_reg_reg(i16 %a1, i16 %a2) nounwind {
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: subw %dx, %ax
; X86-NEXT: setbe %bl
; X86-NEXT: leal -1(%ebx,%ebx), %edx
@@ -765,9 +765,9 @@ define i16 @scalar_i16_signed_mem_reg(ptr %a1_addr, i16 %a2) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %ecx
+; X86-NEXT: movzwl (%eax), %eax
; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: subw %dx, %ax
; X86-NEXT: setle %bl
; X86-NEXT: leal -1(%ebx,%ebx), %edx
@@ -817,11 +817,11 @@ define i16 @scalar_i16_signed_reg_mem(i16 %a1, ptr %a2_addr) nounwind {
; X86-LABEL: scalar_i16_signed_reg_mem:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
-; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzwl (%eax), %edx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl (%ecx), %edx
; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: subw %dx, %ax
; X86-NEXT: setle %bl
; X86-NEXT: leal -1(%ebx,%ebx), %edx
@@ -871,12 +871,12 @@ define i16 @scalar_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; X86-LABEL: scalar_i16_signed_mem_mem:
; X86: # %bb.0:
; X86-NEXT: pushl %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movzwl (%ecx), %ecx
-; X86-NEXT: movzwl (%eax), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl (%eax), %eax
+; X86-NEXT: movzwl (%ecx), %edx
; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: subw %dx, %ax
; X86-NEXT: setle %bl
; X86-NEXT: leal -1(%ebx,%ebx), %edx
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 04f0a65..aa2dd00 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -889,19 +889,17 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-SKX-NOVBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-SKX-NOVBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm3, %ymm5
-; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
-; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4
+; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3
; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm1, %ymm1
-; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4)
-; CHECK-SKX-NOVBMI-NEXT: vpand %ymm4, %ymm2, %ymm3
-; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm5)
+; CHECK-SKX-NOVBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm3
+; CHECK-SKX-NOVBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2
; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %ymm0, %ymm0
-; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4)
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm5)
; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm0, (%rdx)
; CHECK-SKX-NOVBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
; CHECK-SKX-NOVBMI-NEXT: vzeroupper
@@ -913,20 +911,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
-; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
-; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
+; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4
+; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
-; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
-; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
-; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
-; CHECK-SKX-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2
+; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm1, %ymm3, %ymm4
+; CHECK-SKX-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm1
+; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
-; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
-; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
-; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
+; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm0, %ymm3, %ymm1
+; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, (%rdx)
+; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm4, 32(%rdx)
; CHECK-SKX-VBMI-NEXT: vzeroupper
; CHECK-SKX-VBMI-NEXT: retq
;
@@ -936,19 +932,17 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-AVX512-NEXT: vpand %ymm4, %ymm3, %ymm5
-; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
-; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm4
+; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm5, %ymm3
; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1
-; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4)
-; CHECK-AVX512-NEXT: vpand %ymm4, %ymm2, %ymm3
-; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm4 & ymm5)
+; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm3
+; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm5, %ymm2
; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0
-; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4)
+; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm5)
; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx)
; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx)
; CHECK-AVX512-NEXT: vzeroupper
@@ -960,20 +954,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2
; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
-; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
-; CHECK-VBMI-NEXT: vpand %ymm4, %ymm3, %ymm3
+; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm4
+; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm5, %ymm3
; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
-; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
-; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
-; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
-; CHECK-VBMI-NEXT: vpand %ymm4, %ymm2, %ymm2
+; CHECK-VBMI-NEXT: vpermt2b %ymm1, %ymm3, %ymm4
+; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm1
+; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm5, %ymm2
; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
-; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
-; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
-; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
+; CHECK-VBMI-NEXT: vpermt2b %ymm0, %ymm3, %ymm1
+; CHECK-VBMI-NEXT: vmovdqa %ymm1, (%rdx)
+; CHECK-VBMI-NEXT: vmovdqa %ymm4, 32(%rdx)
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
%d = load <64 x i8>, ptr %a
@@ -988,13 +980,12 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-NOVBMI: # %bb.0:
; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SKX-NOVBMI-NEXT: vpandq %zmm2, %zmm1, %zmm3
-; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; CHECK-SKX-NOVBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-SKX-NOVBMI-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-NOVBMI-NEXT: vpandnq %zmm1, %zmm3, %zmm1
; CHECK-SKX-NOVBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-SKX-NOVBMI-NEXT: vpsllw $8, %zmm0, %zmm0
-; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; CHECK-SKX-NOVBMI-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3)
; CHECK-SKX-NOVBMI-NEXT: vmovdqa64 %zmm0, (%rdx)
; CHECK-SKX-NOVBMI-NEXT: vzeroupper
; CHECK-SKX-NOVBMI-NEXT: retq
@@ -1003,13 +994,11 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-SKX-VBMI: # %bb.0:
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-SKX-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
-; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-SKX-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; CHECK-SKX-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-SKX-VBMI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
-; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
+; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm0, %zmm2, %zmm1
; CHECK-SKX-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
; CHECK-SKX-VBMI-NEXT: vzeroupper
; CHECK-SKX-VBMI-NEXT: retq
@@ -1018,13 +1007,12 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-AVX512: # %bb.0:
; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm3
-; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-AVX512-NEXT: vpbroadcastd {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm3, %zmm1
; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0
-; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3)
; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
; CHECK-AVX512-NEXT: vzeroupper
; CHECK-AVX512-NEXT: retq
@@ -1033,13 +1021,11 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
; CHECK-VBMI: # %bb.0:
; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-VBMI-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
-; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; CHECK-VBMI-NEXT: vpandq %zmm2, %zmm1, %zmm1
+; CHECK-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; CHECK-VBMI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
-; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
+; CHECK-VBMI-NEXT: vpermi2b %zmm0, %zmm2, %zmm1
; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
; CHECK-VBMI-NEXT: vzeroupper
; CHECK-VBMI-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll
index 73d459b..8f97d26 100644
--- a/llvm/test/CodeGen/X86/mmx-arith.ll
+++ b/llvm/test/CodeGen/X86/mmx-arith.ll
@@ -403,11 +403,11 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: xorl %eax, %eax
; X86-NEXT: testl %ecx, %ecx
; X86-NEXT: je .LBB3_1
; X86-NEXT: # %bb.2: # %bb26.preheader
; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: xorl %eax, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB3_3: # %bb26
@@ -427,7 +427,6 @@ define <1 x i64> @test3(ptr %a, ptr %b, i32 %count) nounwind {
; X86-NEXT: jb .LBB3_3
; X86-NEXT: jmp .LBB3_4
; X86-NEXT: .LBB3_1:
-; X86-NEXT: xorl %eax, %eax
; X86-NEXT: xorl %edx, %edx
; X86-NEXT: .LBB3_4: # %bb31
; X86-NEXT: popl %esi
diff --git a/llvm/test/CodeGen/X86/mul-constant-i16.ll b/llvm/test/CodeGen/X86/mul-constant-i16.ll
index b1aa789..a663f6a 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i16.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i16.ll
@@ -715,8 +715,8 @@ define i16 @test_mul_by_66(i16 %x) {
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $6, %eax
-; X64-NEXT: leal (%rax,%rdi,2), %eax
+; X64-NEXT: shll $6, %edi
+; X64-NEXT: leal (%rdi,%rax,2), %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 66
@@ -757,8 +757,8 @@ define i16 @test_mul_by_520(i16 %x) {
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $9, %eax
-; X64-NEXT: leal (%rax,%rdi,8), %eax
+; X64-NEXT: shll $9, %edi
+; X64-NEXT: leal (%rdi,%rax,8), %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
%mul = mul nsw i16 %x, 520
diff --git a/llvm/test/CodeGen/X86/mul-constant-i32.ll b/llvm/test/CodeGen/X86/mul-constant-i32.ll
index 79889b9..4129b44 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i32.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i32.ll
@@ -1155,16 +1155,16 @@ define i32 @test_mul_by_66(i32 %x) {
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: movl %edi, %eax
-; X64-HSW-NEXT: shll $6, %eax
-; X64-HSW-NEXT: leal (%rax,%rdi,2), %eax
+; X64-HSW-NEXT: shll $6, %edi
+; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax
; X64-HSW-NEXT: retq
;
; X64-JAG-LABEL: test_mul_by_66:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: movl %edi, %eax
-; X64-JAG-NEXT: shll $6, %eax
-; X64-JAG-NEXT: leal (%rax,%rdi,2), %eax
+; X64-JAG-NEXT: shll $6, %edi
+; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax
; X64-JAG-NEXT: retq
;
; X86-NOOPT-LABEL: test_mul_by_66:
@@ -1241,16 +1241,16 @@ define i32 @test_mul_by_520(i32 %x) {
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: movl %edi, %eax
-; X64-HSW-NEXT: shll $9, %eax
-; X64-HSW-NEXT: leal (%rax,%rdi,8), %eax
+; X64-HSW-NEXT: shll $9, %edi
+; X64-HSW-NEXT: leal (%rdi,%rax,8), %eax
; X64-HSW-NEXT: retq
;
; X64-JAG-LABEL: test_mul_by_520:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: movl %edi, %eax
-; X64-JAG-NEXT: shll $9, %eax
-; X64-JAG-NEXT: leal (%rax,%rdi,8), %eax
+; X64-JAG-NEXT: shll $9, %edi
+; X64-JAG-NEXT: leal (%rdi,%rax,8), %eax
; X64-JAG-NEXT: retq
;
; X86-NOOPT-LABEL: test_mul_by_520:
diff --git a/llvm/test/CodeGen/X86/mul-constant-i8.ll b/llvm/test/CodeGen/X86/mul-constant-i8.ll
index a4fa1ee..b488653 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i8.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i8.ll
@@ -425,8 +425,8 @@ define i8 @test_mul_by_66(i8 %x) {
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shll $6, %eax
-; X64-NEXT: leal (%rax,%rdi,2), %eax
+; X64-NEXT: shll $6, %edi
+; X64-NEXT: leal (%rdi,%rax,2), %eax
; X64-NEXT: # kill: def $al killed $al killed $eax
; X64-NEXT: retq
%m = mul i8 %x, 66
diff --git a/llvm/test/CodeGen/X86/narrow-add-i64.ll b/llvm/test/CodeGen/X86/narrow-add-i64.ll
new file mode 100644
index 0000000..a7a54fd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/narrow-add-i64.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+define i64 @test_add_i64_i16_const(i16 %a) nounwind {
+; X86-LABEL: test_add_i64_i16_const:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl $42, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: test_add_i64_i16_const:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: addq $42, %rax
+; X64-NEXT: retq
+ %zext_a = zext i16 %a to i64
+ %sum = add nuw nsw i64 %zext_a, 42
+ ret i64 %sum
+}
+
+; TODO: The upper 48 bits are all zeros, so we can safely truncate to a 32-bit addition
+define i64 @test_add_i64_i16_zext(i16 %a, i16 %b) nounwind {
+; X86-LABEL: test_add_i64_i16_zext:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: test_add_i64_i16_zext:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %ecx
+; X64-NEXT: movzwl %si, %eax
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: retq
+ %zext_a = zext i16 %a to i64
+ %zext_b = zext i16 %b to i64
+ %sum = add nuw nsw i64 %zext_a, %zext_b
+ ret i64 %sum
+}
+
+; Negative: Set bit 32 of %a to force a 64-bit addition; we do not truncate to a 32-bit addition in this case
+define i64 @negative_test_add_i64_i16(i16 %a) nounwind {
+; X86-LABEL: negative_test_add_i64_i16:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl $42, %eax
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: negative_test_add_i64_i16:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %ecx
+; X64-NEXT: movabsq $4294967338, %rax # imm = 0x10000002A
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: retq
+ %zext_a = zext i16 %a to i64
+ %or_a = or i64 %zext_a, 4294967296
+ %sum = add nuw nsw i64 %or_a, 42
+ ret i64 %sum
+}
+
+; Negative: We don't truncate to a 32-bit addition in the case of sign extension
+define i64 @negative_test_add_i64_i16_sext(i16 %a, i16 %b) nounwind {
+; X86-LABEL: negative_test_add_i64_i16_sext:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: negative_test_add_i64_i16_sext:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: movswq %di, %rcx
+; X64-NEXT: movswq %si, %rax
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: retq
+ %sext_a = sext i16 %a to i64
+ %sext_b = sext i16 %b to i64
+ %sum = add nuw nsw i64 %sext_a, %sext_b
+ ret i64 %sum
+}
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 4b0f75d..ac45541 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -679,39 +679,39 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm4, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm1, %xmm5
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pandn %xmm3, %xmm6
-; SSE2-NEXT: por %xmm5, %xmm6
-; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
-; SSE2-NEXT: packuswb %xmm5, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE2-NEXT: movq %xmm4, (%rsi)
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm5, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; SSE2-NEXT: packuswb %xmm4, %xmm4
+; SSE2-NEXT: movq %xmm4, (%rdx)
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535]
-; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq %xmm4, (%rsi)
-; SSE2-NEXT: movq %xmm5, (%rdx)
; SSE2-NEXT: movq %xmm0, (%rcx)
; SSE2-NEXT: retq
;
@@ -724,16 +724,16 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,3,6,9,12,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; SSE42-NEXT: por %xmm2, %xmm3
+; SSE42-NEXT: movq %xmm3, (%rsi)
; SSE42-NEXT: movdqa %xmm1, %xmm2
; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm2[0,3,6,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: movdqa %xmm0, %xmm4
-; SSE42-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,4,7,10,13],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u]
-; SSE42-NEXT: por %xmm2, %xmm4
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: pshufb {{.*#+}} xmm3 = xmm3[1,4,7,10,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; SSE42-NEXT: por %xmm2, %xmm3
+; SSE42-NEXT: movq %xmm3, (%rdx)
; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; SSE42-NEXT: por %xmm1, %xmm0
-; SSE42-NEXT: movq %xmm3, (%rsi)
-; SSE42-NEXT: movq %xmm4, (%rdx)
; SSE42-NEXT: movq %xmm0, (%rcx)
; SSE42-NEXT: retq
;
@@ -744,14 +744,14 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vmovq %xmm2, (%rsi)
+; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX1-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vmovq %xmm2, (%rdx)
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm2, (%rsi)
-; AVX1-NEXT: vmovq %xmm3, (%rdx)
; AVX1-NEXT: vmovq %xmm0, (%rcx)
; AVX1-NEXT: retq
;
@@ -762,14 +762,14 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
; AVX2-NEXT: vmovq %xmm0, (%rcx)
; AVX2-NEXT: retq
;
@@ -778,10 +778,10 @@ define void @interleave_24i8_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; XOP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; XOP-NEXT: vmovdqu (%rdi), %xmm1
; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[0,3,6,9,12,15],xmm0[2,5],xmm1[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpperm {{.*#+}} xmm3 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u]
; XOP-NEXT: vmovq %xmm2, (%rsi)
-; XOP-NEXT: vmovq %xmm3, (%rdx)
+; XOP-NEXT: vpperm {{.*#+}} xmm2 = xmm1[1,4,7,10,13],xmm0[0,3,6],xmm1[u,u,u,u,u,u,u,u]
+; XOP-NEXT: vmovq %xmm2, (%rdx)
+; XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm1[2,5,8,11,14],xmm0[1,4,7],xmm1[u,u,u,u,u,u,u,u]
; XOP-NEXT: vmovq %xmm0, (%rcx)
; XOP-NEXT: retq
%wide.vec = load <24 x i8>, ptr %p, align 4
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 81390e5..9f08658 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -13,9 +13,11 @@
; CHECK-LABEL: Pass Arguments:
; CHECK-NEXT: Target Library Information
+; CHECK-NEXT: Runtime Library Function Analysis
; CHECK-NEXT: Target Pass Configuration
; CHECK-NEXT: Machine Module Information
; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Library Function Lowering Analysis
; CHECK-NEXT: Assumption Cache Tracker
; CHECK-NEXT: Type-Based Alias Analysis
; CHECK-NEXT: Scoped NoAlias Alias Analysis
@@ -208,8 +210,6 @@
; CHECK-NEXT: X86 Fixup Inst Tuning
; CHECK-NEXT: X86 Fixup Vector Constants
; CHECK-NEXT: Compressing EVEX instrs when possible
-; CHECK-NEXT: X86 Discriminate Memory Operands
-; CHECK-NEXT: X86 Insert Cache Prefetches
; CHECK-NEXT: X86 insert wait instruction
; CHECK-NEXT: Contiguously Lay Out Funclets
; CHECK-NEXT: Remove Loads Into Fake Uses
diff --git a/llvm/test/CodeGen/X86/optimize-max-0.ll b/llvm/test/CodeGen/X86/optimize-max-0.ll
index 283c00e..b6af7e1 100644
--- a/llvm/test/CodeGen/X86/optimize-max-0.ll
+++ b/llvm/test/CodeGen/X86/optimize-max-0.ll
@@ -16,65 +16,65 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: subl $28, %esp
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebp
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: movl %edi, %ecx
-; CHECK-NEXT: imull %ebp, %ecx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: imull %esi, %eax
; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %ecx, (%esp) ## 4-byte Spill
+; CHECK-NEXT: movl %eax, (%esp) ## 4-byte Spill
; CHECK-NEXT: je LBB0_19
; CHECK-NEXT: ## %bb.1: ## %bb10.preheader
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: sarl $31, %eax
-; CHECK-NEXT: shrl $30, %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: sarl $2, %eax
-; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: movl %eax, %ebp
+; CHECK-NEXT: sarl $31, %ebp
+; CHECK-NEXT: shrl $30, %ebp
+; CHECK-NEXT: addl %eax, %ebp
+; CHECK-NEXT: sarl $2, %ebp
+; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: jle LBB0_12
; CHECK-NEXT: ## %bb.2: ## %bb.nph9
-; CHECK-NEXT: testl %ebp, %ebp
+; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: jle LBB0_12
; CHECK-NEXT: ## %bb.3: ## %bb.nph9.split
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: incl %eax
; CHECK-NEXT: xorl %ecx, %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB0_4: ## %bb6
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movzbl (%eax,%esi,2), %ebx
-; CHECK-NEXT: movb %bl, (%edx,%esi)
-; CHECK-NEXT: incl %esi
-; CHECK-NEXT: cmpl %ebp, %esi
+; CHECK-NEXT: movzbl (%eax,%edi,2), %ebx
+; CHECK-NEXT: movb %bl, (%edx,%edi)
+; CHECK-NEXT: incl %edi
+; CHECK-NEXT: cmpl %esi, %edi
; CHECK-NEXT: jl LBB0_4
; CHECK-NEXT: ## %bb.5: ## %bb9
; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1
; CHECK-NEXT: incl %ecx
; CHECK-NEXT: addl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: addl %ebp, %edx
-; CHECK-NEXT: cmpl %edi, %ecx
+; CHECK-NEXT: addl %esi, %edx
+; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: je LBB0_12
; CHECK-NEXT: ## %bb.6: ## %bb7.preheader
; CHECK-NEXT: ## in Loop: Header=BB0_4 Depth=1
-; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: jmp LBB0_4
; CHECK-NEXT: LBB0_12: ## %bb18.loopexit
+; CHECK-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: movl (%esp), %eax ## 4-byte Reload
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx ## 4-byte Reload
-; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: addl %ebp, %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT: cmpl $1, %edi
+; CHECK-NEXT: cmpl $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: jle LBB0_13
; CHECK-NEXT: ## %bb.7: ## %bb.nph5
-; CHECK-NEXT: cmpl $2, %ebp
+; CHECK-NEXT: cmpl $2, %esi
; CHECK-NEXT: jl LBB0_13
; CHECK-NEXT: ## %bb.8: ## %bb.nph5.split
-; CHECK-NEXT: movl %ebp, %edx
-; CHECK-NEXT: shrl $31, %edx
-; CHECK-NEXT: addl %ebp, %edx
-; CHECK-NEXT: sarl %edx
+; CHECK-NEXT: movl %esi, %ebp
+; CHECK-NEXT: shrl $31, %ebp
+; CHECK-NEXT: addl %esi, %ebp
+; CHECK-NEXT: sarl %ebp
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: shrl $31, %ecx
@@ -84,102 +84,103 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
-; CHECK-NEXT: addl $2, %esi
-; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload
-; CHECK-NEXT: addl %esi, %ecx
-; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: addl $2, %edx
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload
+; CHECK-NEXT: addl %edx, %ecx
; CHECK-NEXT: xorl %edi, %edi
+; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB0_9: ## %bb13
; CHECK-NEXT: ## =>This Loop Header: Depth=1
; CHECK-NEXT: ## Child Loop BB0_10 Depth 2
; CHECK-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT: addl %esi, %edi
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT: addl %edx, %edi
; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edi
; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
-; CHECK-NEXT: xorl %esi, %esi
+; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB0_10: ## %bb14
; CHECK-NEXT: ## Parent Loop BB0_9 Depth=1
; CHECK-NEXT: ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movzbl -2(%edi,%esi,4), %ebx
-; CHECK-NEXT: movb %bl, (%ecx,%esi)
-; CHECK-NEXT: movzbl (%edi,%esi,4), %ebx
-; CHECK-NEXT: movb %bl, (%eax,%esi)
-; CHECK-NEXT: incl %esi
-; CHECK-NEXT: cmpl %edx, %esi
+; CHECK-NEXT: movzbl -2(%edi,%ebx,4), %edx
+; CHECK-NEXT: movb %dl, (%ecx,%ebx)
+; CHECK-NEXT: movzbl (%edi,%ebx,4), %edx
+; CHECK-NEXT: movb %dl, (%eax,%ebx)
+; CHECK-NEXT: incl %ebx
+; CHECK-NEXT: cmpl %ebp, %ebx
; CHECK-NEXT: jl LBB0_10
; CHECK-NEXT: ## %bb.11: ## %bb17
; CHECK-NEXT: ## in Loop: Header=BB0_9 Depth=1
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Reload
; CHECK-NEXT: incl %edi
-; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
-; CHECK-NEXT: addl $2, %esi
-; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: addl %ebp, %eax
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
+; CHECK-NEXT: addl $2, %edx
+; CHECK-NEXT: addl %ebp, %ecx
; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edi ## 4-byte Folded Reload
; CHECK-NEXT: jl LBB0_9
; CHECK-NEXT: LBB0_13: ## %bb20
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: cmpl $1, %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT: cmpl $1, %ecx
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ebx
; CHECK-NEXT: je LBB0_19
; CHECK-NEXT: ## %bb.14: ## %bb20
-; CHECK-NEXT: cmpl $3, %eax
+; CHECK-NEXT: cmpl $3, %ecx
; CHECK-NEXT: jne LBB0_24
; CHECK-NEXT: ## %bb.15: ## %bb22
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; CHECK-NEXT: addl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
-; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp ## 4-byte Reload
+; CHECK-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: jle LBB0_18
; CHECK-NEXT: ## %bb.16: ## %bb.nph
-; CHECK-NEXT: leal 15(%edi), %eax
+; CHECK-NEXT: leal 15(%edx), %eax
; CHECK-NEXT: andl $-16, %eax
; CHECK-NEXT: imull {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: addl %ebx, %ebx
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl (%esp), %esi ## 4-byte Reload
-; CHECK-NEXT: addl %esi, %ecx
-; CHECK-NEXT: addl %ecx, %ebx
-; CHECK-NEXT: addl %eax, %edx
-; CHECK-NEXT: leal 15(%ebp), %eax
+; CHECK-NEXT: addl %ebp, %ebp
+; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT: addl %edi, %ecx
+; CHECK-NEXT: addl %ecx, %ebp
+; CHECK-NEXT: addl %eax, %ebx
+; CHECK-NEXT: leal 15(%esi), %eax
; CHECK-NEXT: andl $-16, %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB0_17: ## %bb23
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subl $4, %esp
-; CHECK-NEXT: pushl %ebp
-; CHECK-NEXT: pushl %edx
+; CHECK-NEXT: pushl %esi
; CHECK-NEXT: pushl %ebx
-; CHECK-NEXT: movl %ebx, %esi
+; CHECK-NEXT: pushl %ebp
+; CHECK-NEXT: movl %ebp, %edi
+; CHECK-NEXT: movl %ebx, %ebp
; CHECK-NEXT: movl %edx, %ebx
; CHECK-NEXT: calll _memcpy
; CHECK-NEXT: movl %ebx, %edx
-; CHECK-NEXT: movl %esi, %ebx
+; CHECK-NEXT: movl %ebp, %ebx
+; CHECK-NEXT: movl %edi, %ebp
; CHECK-NEXT: addl $16, %esp
-; CHECK-NEXT: addl %ebp, %ebx
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; CHECK-NEXT: decl %edi
+; CHECK-NEXT: addl %esi, %ebp
+; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT: decl %edx
; CHECK-NEXT: jne LBB0_17
; CHECK-NEXT: LBB0_18: ## %bb26
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax ## 4-byte Reload
-; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload
-; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; CHECK-NEXT: addl %ecx, %esi
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: addl %esi, %edx
; CHECK-NEXT: jmp LBB0_23
; CHECK-NEXT: LBB0_19: ## %bb29
-; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: jle LBB0_22
; CHECK-NEXT: ## %bb.20: ## %bb.nph11
-; CHECK-NEXT: movl %edi, %esi
-; CHECK-NEXT: leal 15(%ebp), %eax
+; CHECK-NEXT: leal 15(%esi), %eax
; CHECK-NEXT: andl $-16, %eax
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi
@@ -187,30 +188,32 @@ define void @foo(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
; CHECK-NEXT: LBB0_21: ## %bb30
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subl $4, %esp
-; CHECK-NEXT: pushl %ebp
-; CHECK-NEXT: pushl %edx
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: pushl %ebx
; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: movl %ebx, %ebp
; CHECK-NEXT: movl %edx, %ebx
; CHECK-NEXT: calll _memcpy
; CHECK-NEXT: movl %ebx, %edx
+; CHECK-NEXT: movl %ebp, %ebx
; CHECK-NEXT: addl $16, %esp
-; CHECK-NEXT: addl %ebp, %edi
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
-; CHECK-NEXT: decl %esi
+; CHECK-NEXT: addl %esi, %edi
+; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT: decl %edx
; CHECK-NEXT: jne LBB0_21
; CHECK-NEXT: LBB0_22: ## %bb33
-; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload
-; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: movl (%esp), %ecx ## 4-byte Reload
+; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: addl %ecx, %edx
; CHECK-NEXT: LBB0_23: ## %bb33
-; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: shrl $31, %eax
-; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: addl %ecx, %eax
; CHECK-NEXT: sarl %eax
; CHECK-NEXT: subl $4, %esp
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: pushl $128
-; CHECK-NEXT: pushl %ecx
+; CHECK-NEXT: pushl %edx
; CHECK-NEXT: calll _memset
; CHECK-NEXT: addl $44, %esp
; CHECK-NEXT: LBB0_25: ## %return
@@ -523,38 +526,38 @@ define void @bar(ptr %r, i32 %s, i32 %w, i32 %x, ptr %j, i32 %d) nounwind {
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Reload
; CHECK-NEXT: addl %edx, %eax
; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB1_9: ## %bb13
; CHECK-NEXT: ## =>This Loop Header: Depth=1
; CHECK-NEXT: ## Child Loop BB1_10 Depth 2
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
-; CHECK-NEXT: andl $1, %ebx
; CHECK-NEXT: movl %edx, (%esp) ## 4-byte Spill
-; CHECK-NEXT: addl %edx, %ebx
-; CHECK-NEXT: imull {{[0-9]+}}(%esp), %ebx
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT: andl $1, %edx
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) ## 4-byte Spill
+; CHECK-NEXT: addl %esi, %edx
+; CHECK-NEXT: imull {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; CHECK-NEXT: xorl %esi, %esi
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: LBB1_10: ## %bb14
; CHECK-NEXT: ## Parent Loop BB1_9 Depth=1
; CHECK-NEXT: ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movzbl -2(%ebx,%esi,4), %edx
-; CHECK-NEXT: movb %dl, (%eax,%esi)
-; CHECK-NEXT: movzbl (%ebx,%esi,4), %edx
-; CHECK-NEXT: movb %dl, (%ecx,%esi)
+; CHECK-NEXT: movzbl -2(%edx,%esi,4), %ebx
+; CHECK-NEXT: movb %bl, (%eax,%esi)
+; CHECK-NEXT: movzbl (%edx,%esi,4), %ebx
+; CHECK-NEXT: movb %bl, (%ecx,%esi)
; CHECK-NEXT: incl %esi
; CHECK-NEXT: cmpl %ebp, %esi
; CHECK-NEXT: jb LBB1_10
; CHECK-NEXT: ## %bb.11: ## %bb17
; CHECK-NEXT: ## in Loop: Header=BB1_9 Depth=1
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Reload
-; CHECK-NEXT: incl %ebx
-; CHECK-NEXT: addl %ebp, %ecx
; CHECK-NEXT: movl (%esp), %edx ## 4-byte Reload
-; CHECK-NEXT: addl $2, %edx
+; CHECK-NEXT: incl %edx
+; CHECK-NEXT: addl %ebp, %ecx
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi ## 4-byte Reload
+; CHECK-NEXT: addl $2, %esi
; CHECK-NEXT: addl %ebp, %eax
-; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ebx ## 4-byte Folded Reload
+; CHECK-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %edx ## 4-byte Folded Reload
; CHECK-NEXT: jb LBB1_9
; CHECK-NEXT: LBB1_13: ## %bb20
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi
diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll
index 420f5ba..31a7f11 100644
--- a/llvm/test/CodeGen/X86/parity.ll
+++ b/llvm/test/CodeGen/X86/parity.ll
@@ -219,12 +219,12 @@ define i64 @parity_64(i64 %x) {
;
; X64-NOPOPCNT-LABEL: parity_64:
; X64-NOPOPCNT: # %bb.0:
-; X64-NOPOPCNT-NEXT: movq %rdi, %rax
-; X64-NOPOPCNT-NEXT: shrq $32, %rax
-; X64-NOPOPCNT-NEXT: xorl %edi, %eax
-; X64-NOPOPCNT-NEXT: movl %eax, %ecx
+; X64-NOPOPCNT-NEXT: movl %edi, %eax
+; X64-NOPOPCNT-NEXT: shrq $32, %rdi
+; X64-NOPOPCNT-NEXT: xorl %eax, %edi
+; X64-NOPOPCNT-NEXT: movl %edi, %ecx
; X64-NOPOPCNT-NEXT: shrl $16, %ecx
-; X64-NOPOPCNT-NEXT: xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT: xorl %edi, %ecx
; X64-NOPOPCNT-NEXT: xorl %eax, %eax
; X64-NOPOPCNT-NEXT: xorb %ch, %cl
; X64-NOPOPCNT-NEXT: setnp %al
@@ -264,12 +264,12 @@ define i32 @parity_64_trunc(i64 %x) {
;
; X64-NOPOPCNT-LABEL: parity_64_trunc:
; X64-NOPOPCNT: # %bb.0:
-; X64-NOPOPCNT-NEXT: movq %rdi, %rax
-; X64-NOPOPCNT-NEXT: shrq $32, %rax
-; X64-NOPOPCNT-NEXT: xorl %edi, %eax
-; X64-NOPOPCNT-NEXT: movl %eax, %ecx
+; X64-NOPOPCNT-NEXT: movl %edi, %eax
+; X64-NOPOPCNT-NEXT: shrq $32, %rdi
+; X64-NOPOPCNT-NEXT: xorl %eax, %edi
+; X64-NOPOPCNT-NEXT: movl %edi, %ecx
; X64-NOPOPCNT-NEXT: shrl $16, %ecx
-; X64-NOPOPCNT-NEXT: xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT: xorl %edi, %ecx
; X64-NOPOPCNT-NEXT: xorl %eax, %eax
; X64-NOPOPCNT-NEXT: xorb %ch, %cl
; X64-NOPOPCNT-NEXT: setnp %al
@@ -628,12 +628,12 @@ define i64 @parity_64_shift(i64 %0) {
;
; X64-NOPOPCNT-LABEL: parity_64_shift:
; X64-NOPOPCNT: # %bb.0:
-; X64-NOPOPCNT-NEXT: movq %rdi, %rax
-; X64-NOPOPCNT-NEXT: shrq $32, %rax
-; X64-NOPOPCNT-NEXT: xorl %edi, %eax
-; X64-NOPOPCNT-NEXT: movl %eax, %ecx
+; X64-NOPOPCNT-NEXT: movl %edi, %eax
+; X64-NOPOPCNT-NEXT: shrq $32, %rdi
+; X64-NOPOPCNT-NEXT: xorl %eax, %edi
+; X64-NOPOPCNT-NEXT: movl %edi, %ecx
; X64-NOPOPCNT-NEXT: shrl $16, %ecx
-; X64-NOPOPCNT-NEXT: xorl %eax, %ecx
+; X64-NOPOPCNT-NEXT: xorl %edi, %ecx
; X64-NOPOPCNT-NEXT: xorl %eax, %eax
; X64-NOPOPCNT-NEXT: xorb %ch, %cl
; X64-NOPOPCNT-NEXT: setnp %al
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 00731fe..a1808e4 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -10,7 +10,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,u,117,u,117,u,117,u,117,u,117,u,117,u,117,u]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
@@ -25,7 +25,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -160,16 +160,14 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
;
; SSE41-LABEL: mul_v16i8:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pmaddubsw %xmm3, %xmm4
-; SSE41-NEXT: pand %xmm2, %xmm4
-; SSE41-NEXT: pandn %xmm1, %xmm2
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm2
+; SSE41-NEXT: pmullw %xmm1, %xmm2
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm3, %xmm2
+; SSE41-NEXT: pandn %xmm1, %xmm3
+; SSE41-NEXT: pmaddubsw %xmm3, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8:
@@ -380,7 +378,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [117,117,117,117,117,117,117,117]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [117,u,117,u,117,u,117,u,117,u,117,u,117,u,117,u]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm2
@@ -400,28 +398,27 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
;
; SSE41-LABEL: mul_v32i8c:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm3
+; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; SSE41-NEXT: pmaddubsw %xmm5, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm3
-; SSE41-NEXT: pand %xmm4, %xmm3
+; SSE41-NEXT: pmullw %xmm1, %xmm2
+; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: pmaddubsw %xmm5, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm3, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -430,7 +427,7 @@ define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -584,49 +581,44 @@ define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
;
; SSE41-LABEL: mul_v32i8:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pandn %xmm2, %xmm5
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm6
-; SSE41-NEXT: pand %xmm4, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm5, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pmullw %xmm2, %xmm4
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE41-NEXT: pand %xmm5, %xmm4
+; SSE41-NEXT: movdqa %xmm5, %xmm6
+; SSE41-NEXT: pandn %xmm2, %xmm6
+; SSE41-NEXT: pmaddubsw %xmm6, %xmm0
; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm2, %xmm5
-; SSE41-NEXT: pand %xmm4, %xmm5
-; SSE41-NEXT: pandn %xmm3, %xmm4
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
+; SSE41-NEXT: por %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pmullw %xmm3, %xmm2
+; SSE41-NEXT: pand %xmm5, %xmm2
+; SSE41-NEXT: pandn %xmm3, %xmm5
+; SSE41-NEXT: pmaddubsw %xmm5, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm5, %xmm1
+; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpandn %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm3
-; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpandn %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8:
@@ -737,7 +729,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,u,117,u,117,u,117,u,117,u,117,u,117,u,117,u]
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm5, %xmm6
@@ -773,9 +765,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
;
; SSE41-LABEL: mul_v64i8c:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE41-NEXT: movdqa %xmm0, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
+; SSE41-NEXT: pmullw %xmm4, %xmm6
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm5, %xmm6
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
@@ -783,36 +775,35 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: movdqa %xmm1, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
+; SSE41-NEXT: pmullw %xmm4, %xmm6
; SSE41-NEXT: pand %xmm5, %xmm6
; SSE41-NEXT: pmaddubsw %xmm7, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: por %xmm6, %xmm1
; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
+; SSE41-NEXT: pmullw %xmm4, %xmm6
; SSE41-NEXT: pand %xmm5, %xmm6
; SSE41-NEXT: pmaddubsw %xmm7, %xmm2
; SSE41-NEXT: psllw $8, %xmm2
; SSE41-NEXT: por %xmm6, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm6
-; SSE41-NEXT: pand %xmm5, %xmm6
+; SSE41-NEXT: pmullw %xmm3, %xmm4
+; SSE41-NEXT: pand %xmm5, %xmm4
; SSE41-NEXT: pmaddubsw %xmm7, %xmm3
; SSE41-NEXT: psllw $8, %xmm3
-; SSE41-NEXT: por %xmm6, %xmm3
+; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8c:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
-; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -822,9 +813,9 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; AVX512F-LABEL: mul_v64i8c:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
-; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3
-; AVX512F-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
+; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm0
@@ -837,7 +828,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
;
; AVX512BW-LABEL: mul_v64i8c:
; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117]
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
@@ -899,59 +890,52 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
;
; SSE41-LABEL: mul_v64i8:
; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: movdqa %xmm0, %xmm9
+; SSE41-NEXT: pmullw %xmm4, %xmm9
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: movdqa %xmm8, %xmm9
-; SSE41-NEXT: pandn %xmm4, %xmm9
-; SSE41-NEXT: pand %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm10
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm10
-; SSE41-NEXT: pand %xmm8, %xmm10
-; SSE41-NEXT: pmaddubsw %xmm9, %xmm0
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: por %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pandn %xmm5, %xmm4
-; SSE41-NEXT: pand %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm1, %xmm9
-; SSE41-NEXT: pmaddubsw %xmm5, %xmm9
; SSE41-NEXT: pand %xmm8, %xmm9
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm8, %xmm10
+; SSE41-NEXT: pandn %xmm4, %xmm10
+; SSE41-NEXT: pmaddubsw %xmm10, %xmm0
+; SSE41-NEXT: psllw $8, %xmm0
+; SSE41-NEXT: por %xmm9, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pmullw %xmm5, %xmm4
+; SSE41-NEXT: pand %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm9
+; SSE41-NEXT: pandn %xmm5, %xmm9
+; SSE41-NEXT: pmaddubsw %xmm9, %xmm1
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: por %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm8, %xmm4
-; SSE41-NEXT: pandn %xmm6, %xmm4
-; SSE41-NEXT: pand %xmm8, %xmm6
-; SSE41-NEXT: movdqa %xmm2, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm6, %xmm5
-; SSE41-NEXT: pand %xmm8, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm2
+; SSE41-NEXT: por %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm2, %xmm4
+; SSE41-NEXT: pmullw %xmm6, %xmm4
+; SSE41-NEXT: pand %xmm8, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm5
+; SSE41-NEXT: pandn %xmm6, %xmm5
+; SSE41-NEXT: pmaddubsw %xmm5, %xmm2
; SSE41-NEXT: psllw $8, %xmm2
-; SSE41-NEXT: por %xmm5, %xmm2
-; SSE41-NEXT: movdqa %xmm7, %xmm4
+; SSE41-NEXT: por %xmm4, %xmm2
+; SSE41-NEXT: movdqa %xmm3, %xmm4
+; SSE41-NEXT: pmullw %xmm7, %xmm4
; SSE41-NEXT: pand %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: pmaddubsw %xmm4, %xmm5
-; SSE41-NEXT: pand %xmm8, %xmm5
; SSE41-NEXT: pandn %xmm7, %xmm8
; SSE41-NEXT: pmaddubsw %xmm8, %xmm3
; SSE41-NEXT: psllw $8, %xmm3
-; SSE41-NEXT: por %xmm5, %xmm3
+; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
-; AVX2-NEXT: vpand %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm4
+; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpandn %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vpandn %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
@@ -959,33 +943,30 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
;
; AVX512F-LABEL: mul_v64i8:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
-; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm6
-; AVX512F-NEXT: vpmaddubsw %ymm6, %ymm0, %ymm6
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm6, %zmm4
-; AVX512F-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm5 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX512F-NEXT: vpandn %ymm1, %ymm5, %ymm1
; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm1
-; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1
+; AVX512F-NEXT: vpandn %ymm2, %ymm5, %ymm1
+; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm3, %ymm1
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm2)
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm5)
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v64i8:
; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpandnq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2)
+; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm2 & zmm3)
; AVX512BW-NEXT: retq
entry:
%A = mul <64 x i8> %i, %j
diff --git a/llvm/test/CodeGen/X86/pr114360.ll b/llvm/test/CodeGen/X86/pr114360.ll
index cf51085..41cf06a 100644
--- a/llvm/test/CodeGen/X86/pr114360.ll
+++ b/llvm/test/CodeGen/X86/pr114360.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; REQUIRES: asserts
; RUN: llc < %s -mtriple=x86_64-- -debug-counter=dagcombine=0 | FileCheck %s
; BUG: shrinkAndImmediate folds away the AND after the ZEXT has already been folded away to SUBREG_TO_REG losing implicit zext.
diff --git a/llvm/test/CodeGen/X86/pr165755.ll b/llvm/test/CodeGen/X86/pr165755.ll
new file mode 100644
index 0000000..3ab484f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr165755.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64
+
+define i32 @PR165755(ptr %p0) {
+; X86-LABEL: PR165755:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %eax
+; X86-NEXT: movb $0, (%ecx)
+; X86-NEXT: retl
+;
+; X64-LABEL: PR165755:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: movb $0, (%rdi)
+; X64-NEXT: retq
+ %ld64 = load i64, ptr %p0, align 8
+ store i8 0, ptr %p0, align 1
+ %ld32 = load i32, ptr %p0, align 8
+ %mask = and i32 %ld32, 32
+ %zext = zext i32 %mask to i64
+ %srl = lshr i64 %ld64, %zext
+ %res = trunc i64 %srl to i32
+ ret i32 %res
+}
diff --git a/llvm/test/CodeGen/X86/pr166058.ll b/llvm/test/CodeGen/X86/pr166058.ll
new file mode 100644
index 0000000..42d68fd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166058.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+@out = global i32 0, align 4
+define void @bar() {
+; CHECK-LABEL: bar:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq out@GOTPCREL(%rip), %rax
+; CHECK-NEXT: #APP
+; CHECK-NEXT: addl $-1, (%rax)
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: retq
+ call void asm "addl $1,$0", "=*m,L,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) @out, i32 -1)
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
new file mode 100644
index 0000000..162a0c9
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+
+define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
+; SSE2-LABEL: pr166534:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu (%rsi), %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %esi
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: orq %rax, (%rdx)
+; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT: jne .LBB0_2
+; SSE2-NEXT: # %bb.1: # %if.then
+; SSE2-NEXT: orq %rax, (%rcx)
+; SSE2-NEXT: .LBB0_2: # %if.end
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: pr166534:
+; SSE4: # %bb.0: # %entry
+; SSE4-NEXT: movdqu (%rdi), %xmm0
+; SSE4-NEXT: movdqu (%rsi), %xmm1
+; SSE4-NEXT: pxor %xmm0, %xmm1
+; SSE4-NEXT: xorl %eax, %eax
+; SSE4-NEXT: ptest %xmm1, %xmm1
+; SSE4-NEXT: sete %al
+; SSE4-NEXT: orq %rax, (%rdx)
+; SSE4-NEXT: ptest %xmm1, %xmm1
+; SSE4-NEXT: jne .LBB0_2
+; SSE4-NEXT: # %bb.1: # %if.then
+; SSE4-NEXT: orq %rax, (%rcx)
+; SSE4-NEXT: .LBB0_2: # %if.end
+; SSE4-NEXT: retq
+;
+; AVX2-LABEL: pr166534:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: orq %rax, (%rdx)
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: jne .LBB0_2
+; AVX2-NEXT: # %bb.1: # %if.then
+; AVX2-NEXT: orq %rax, (%rcx)
+; AVX2-NEXT: .LBB0_2: # %if.end
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: pr166534:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: orq %rax, (%rdx)
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: jne .LBB0_2
+; AVX512-NEXT: # %bb.1: # %if.then
+; AVX512-NEXT: orq %rax, (%rcx)
+; AVX512-NEXT: .LBB0_2: # %if.end
+; AVX512-NEXT: retq
+entry:
+ %a = load i128, ptr %pa, align 8
+ %b = load i128, ptr %pb, align 8
+ %cmp = icmp eq i128 %a, %b
+ %conv1 = zext i1 %cmp to i128
+ %c = load i128, ptr %pc, align 8
+ %or = or i128 %c, %conv1
+ store i128 %or, ptr %pc, align 8
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %d = load i128, ptr %pd, align 8
+ %or7 = or i128 %d, %conv1
+ store i128 %or7, ptr %pd, align 8
+ br label %if.end
+
+if.end:
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll
new file mode 100644
index 0000000..ffdb68c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166744.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=POSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=NOPOSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=NOPOSTRA
+
+; Ensure reloads are after narrowed i512 -> i32 store
+define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
+; POSTRA-LABEL: PR166744:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: movl $1029, %eax # imm = 0x405
+; POSTRA-NEXT: shlxl %esi, %edx, %edx
+; POSTRA-NEXT: bextrl %eax, %esi, %eax
+; POSTRA-NEXT: movl (%rdi,%rax,4), %ecx
+; POSTRA-NEXT: btrl %esi, %ecx
+; POSTRA-NEXT: orl %ecx, %edx
+; POSTRA-NEXT: movl %edx, (%rdi,%rax,4)
+; POSTRA-NEXT: movq 16(%rdi), %rax
+; POSTRA-NEXT: movq (%rdi), %rcx
+; POSTRA-NEXT: movq 24(%rdi), %rdx
+; POSTRA-NEXT: movq 8(%rdi), %rsi
+; POSTRA-NEXT: orq 56(%rdi), %rdx
+; POSTRA-NEXT: orq 40(%rdi), %rsi
+; POSTRA-NEXT: orq 48(%rdi), %rax
+; POSTRA-NEXT: orq 32(%rdi), %rcx
+; POSTRA-NEXT: orq %rdx, %rsi
+; POSTRA-NEXT: orq %rax, %rcx
+; POSTRA-NEXT: orq %rsi, %rcx
+; POSTRA-NEXT: setne %al
+; POSTRA-NEXT: retq
+;
+; NOPOSTRA-LABEL: PR166744:
+; NOPOSTRA: # %bb.0:
+; NOPOSTRA-NEXT: movl %esi, %eax
+; NOPOSTRA-NEXT: shrl $3, %esi
+; NOPOSTRA-NEXT: andl $60, %esi
+; NOPOSTRA-NEXT: movl (%rdi,%rsi), %ecx
+; NOPOSTRA-NEXT: btrl %eax, %ecx
+; NOPOSTRA-NEXT: shlxl %eax, %edx, %eax
+; NOPOSTRA-NEXT: orl %ecx, %eax
+; NOPOSTRA-NEXT: movl %eax, (%rdi,%rsi)
+; NOPOSTRA-NEXT: movq 16(%rdi), %rax
+; NOPOSTRA-NEXT: movq (%rdi), %rcx
+; NOPOSTRA-NEXT: movq 8(%rdi), %rdx
+; NOPOSTRA-NEXT: movq 24(%rdi), %rsi
+; NOPOSTRA-NEXT: orq 56(%rdi), %rsi
+; NOPOSTRA-NEXT: orq 40(%rdi), %rdx
+; NOPOSTRA-NEXT: orq 48(%rdi), %rax
+; NOPOSTRA-NEXT: orq 32(%rdi), %rcx
+; NOPOSTRA-NEXT: orq %rsi, %rdx
+; NOPOSTRA-NEXT: orq %rax, %rcx
+; NOPOSTRA-NEXT: orq %rdx, %rcx
+; NOPOSTRA-NEXT: setne %al
+; NOPOSTRA-NEXT: retq
+ %rem = and i64 %idx, 511
+ %sh_prom = zext nneg i64 %rem to i512
+ %shl = shl nuw i512 1, %sh_prom
+ %not = xor i512 %shl, -1
+ %load = load i512, ptr %v, align 8
+ %and = and i512 %load, %not
+ %conv2 = zext i1 %b to i512
+ %shl4 = shl nuw i512 %conv2, %sh_prom
+ %or = or i512 %and, %shl4
+ store i512 %or, ptr %v, align 8
+ %cmp = icmp ne i512 %or, 0
+ ret i1 %cmp
+}
diff --git a/llvm/test/CodeGen/X86/pr167793.ll b/llvm/test/CodeGen/X86/pr167793.ll
new file mode 100644
index 0000000..9b394bf
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr167793.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=btver2 | FileCheck %s
+
+define <4 x double> @PR167793(<4 x double> %a0, <4 x double> %a1) {
+; CHECK-LABEL: PR167793:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vhaddpd %ymm0, %ymm0, %ymm0
+; CHECK-NEXT: vhaddpd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3]
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0
+; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; CHECK-NEXT: retq
+ %i5 = shufflevector <4 x double> %a0, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %i6 = fadd <4 x double> %a0, %i5
+ %i8 = shufflevector <4 x double> %a1, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+ %i9 = fadd <4 x double> %a1, %i8
+ %i10 = shufflevector <4 x double> %i6, <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+ %i11 = shufflevector <4 x double> %i6, <4 x double> poison, <2 x i32> <i32 poison, i32 1>
+ %i12 = fadd <2 x double> %i10, %i11
+ %i13 = shufflevector <4 x double> %i9, <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+ %i14 = shufflevector <4 x double> %i9, <4 x double> poison, <2 x i32> <i32 poison, i32 1>
+ %i15 = fadd <2 x double> %i13, %i14
+ %i16 = shufflevector <4 x double> zeroinitializer, <4 x double> poison, <2 x i32> <i32 poison, i32 1>
+ %i18 = shufflevector <2 x double> %i15, <2 x double> %i16, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+ %i19 = shufflevector <2 x double> %i12, <2 x double> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+ %i20 = shufflevector <4 x double> %i19, <4 x double> %i18, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x double> %i20
+}
diff --git a/llvm/test/CodeGen/X86/pr168594.ll b/llvm/test/CodeGen/X86/pr168594.ll
new file mode 100644
index 0000000..76bb132
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr168594.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
+
+define <8 x i16> @PR168594() {
+; SSE-LABEL: PR168594:
+; SSE: # %bb.0:
+; SSE-NEXT: pxor %xmm0, %xmm0
+; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR168594:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %call = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> splat (i16 1), <8 x i16> zeroinitializer)
+ %sub = sub <8 x i16> zeroinitializer, %call
+ ret <8 x i16> %sub
+}
diff --git a/llvm/test/CodeGen/X86/pr169205.ll b/llvm/test/CodeGen/X86/pr169205.ll
new file mode 100644
index 0000000..1416102
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr169205.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX
+
+define <4 x i16> @PR169205() {
+; SSE-LABEL: PR169205:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1,u,u,u,u]
+; SSE-NEXT: retq
+;
+; AVX-LABEL: PR169205:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: retq
+ %avg = tail call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer)
+ %shuffle24 = shufflevector <16 x i8> %avg, <16 x i8> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 9, i32 9>
+ %conv25 = zext <4 x i8> %shuffle24 to <4 x i16>
+ %not.neg = add <4 x i16> %conv25, splat (i16 1)
+ ret <4 x i16> %not.neg
+}
diff --git a/llvm/test/CodeGen/X86/pr49451.ll b/llvm/test/CodeGen/X86/pr49451.ll
index 173c411..1a7551f 100644
--- a/llvm/test/CodeGen/X86/pr49451.ll
+++ b/llvm/test/CodeGen/X86/pr49451.ll
@@ -18,15 +18,15 @@ define void @func_6(i8 %uc_8, i64 %uli_10) nounwind {
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB0_1: # %for.body612
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: testb %dl, %dl
+; X86-NEXT: testb %bl, %bl
; X86-NEXT: je .LBB0_2
; X86-NEXT: # %bb.3: # %if.end1401
; X86-NEXT: # in Loop: Header=BB0_1 Depth=1
; X86-NEXT: addl %eax, %esi
; X86-NEXT: movw %si, s_2
-; X86-NEXT: movw %bx, s_0
+; X86-NEXT: movw %dx, s_0
; X86-NEXT: incl %ecx
-; X86-NEXT: incl %ebx
+; X86-NEXT: incl %edx
; X86-NEXT: cmpw $73, %cx
; X86-NEXT: jl .LBB0_1
; X86-NEXT: # %bb.4: # %for.body1703
diff --git a/llvm/test/CodeGen/X86/pr63790.ll b/llvm/test/CodeGen/X86/pr63790.ll
new file mode 100644
index 0000000..e4e7a3c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr63790.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
+
+define void @f(ptr %0, i64 %1) {
+; CHECK-LABEL: f:
+; CHECK: # %bb.0: # %BB
+; CHECK-NEXT: subq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: andl $1, %esi
+; CHECK-NEXT: movaps (%rdi), %xmm0
+; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $42, %edi
+; CHECK-NEXT: callq *16(%rsp,%rsi,8)
+; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movaps %xmm0, (%rax)
+; CHECK-NEXT: addq $40, %rsp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+BB:
+ %fps = load <2 x ptr>, ptr %0
+ %fp = extractelement <2 x ptr> %fps, i64 %1
+ %p = call ptr %fp(i32 42)
+ store <2 x ptr> %fps, ptr %p
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
index ad08eaf..7e00d67 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll
@@ -43,25 +43,23 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) {
; AVX256-LABEL: testv16i1_sext_v16i8:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT: vmovdqa (%rsi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
+; AVX256-NEXT: kshiftrw $8, %k1, %k1
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
-; AVX256-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256-NEXT: vpacksswb %xmm0, %xmm1, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
;
; AVX512VL-LABEL: testv16i1_sext_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -70,10 +68,8 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) {
; AVX512F-LABEL: testv16i1_sext_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -91,13 +87,13 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) {
; AVX256-LABEL: testv16i1_sext_v16i16:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT: vmovdqa (%rsi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
-; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z}
+; AVX256-NEXT: kshiftrw $8, %k1, %k1
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
; AVX256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX256-NEXT: retq
@@ -105,10 +101,8 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) {
; AVX512VL-LABEL: testv16i1_sext_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: retq
@@ -116,10 +110,8 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) {
; AVX512F-LABEL: testv16i1_sext_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: retq
@@ -173,27 +165,25 @@ define <16 x i8> @testv16i1_zext_v16i8(ptr %p, ptr %q) {
; AVX256-LABEL: testv16i1_zext_v16i8:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT: vmovdqa (%rsi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
; AVX256-NEXT: vpsrlw $15, %xmm1, %xmm1
+; AVX256-NEXT: kshiftrw $8, %k1, %k1
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
; AVX256-NEXT: vpsrlw $15, %xmm0, %xmm0
-; AVX256-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX256-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
;
; AVX512VL-LABEL: testv16i1_zext_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
@@ -202,10 +192,8 @@ define <16 x i8> @testv16i1_zext_v16i8(ptr %p, ptr %q) {
; AVX512F-LABEL: testv16i1_zext_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -223,13 +211,13 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) {
; AVX256-LABEL: testv16i1_zext_v16i16:
; AVX256: # %bb.0:
; AVX256-NEXT: vmovdqa (%rdi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX256-NEXT: vmovdqa (%rsi), %ymm0
-; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
+; AVX256-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX256-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
-; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k2} {z}
+; AVX256-NEXT: kshiftrw $8, %k1, %k1
+; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
; AVX256-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX256-NEXT: vpsrlw $15, %ymm0, %ymm0
@@ -238,10 +226,8 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) {
; AVX512VL-LABEL: testv16i1_zext_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
-; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
-; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
+; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT: vpsrlw $15, %ymm0, %ymm0
@@ -250,10 +236,8 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) {
; AVX512F-LABEL: testv16i1_zext_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1
-; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
index 3699c7f7..9338434 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
@@ -18,26 +18,23 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) {
; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
-; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} {z}
-; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2
-; AVX256VL-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
-; AVX256VL-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,12,13,4,5,8,9,6,7,14,15,14,15,0,1]
-; AVX256VL-NEXT: vpmovsxwd %xmm3, %ymm3
-; AVX256VL-NEXT: vpslld $31, %ymm3, %ymm3
-; AVX256VL-NEXT: vptestmd %ymm3, %ymm3, %k1
-; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; AVX256VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,12,13,2,3,u,u,6,7,u,u,14,15,0,1]
-; AVX256VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4],xmm1[5],xmm2[6,7]
-; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
-; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0
-; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0
-; AVX256VL-NEXT: kshiftrw $8, %k0, %k2
-; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,1,3]
+; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm3 {%k1} {z}
+; AVX256VL-NEXT: vpmovdw %ymm3, %xmm3
+; AVX256VL-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[6,7,12,13,2,3,u,u,6,7,u,u,14,15,0,1]
+; AVX256VL-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3],xmm4[4],xmm2[5],xmm4[6,7]
+; AVX256VL-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4],xmm3[5,6,7]
+; AVX256VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,12,13,4,5,8,9,6,7,14,15,14,15,0,1]
+; AVX256VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX256VL-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX256VL-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX256VL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
+; AVX256VL-NEXT: kshiftrw $8, %k1, %k1
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX256VL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256VL-NEXT: vpacksswb %xmm0, %xmm1, %xmm0
; AVX256VL-NEXT: vzeroupper
; AVX256VL-NEXT: retq
;
@@ -135,14 +132,12 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX256VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1
; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX256VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX256VL-NEXT: vpmovsxbd %xmm1, %ymm1
-; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k2
-; AVX256VL-NEXT: vpmovsxbd %xmm0, %ymm0
-; AVX256VL-NEXT: vptestmd %ymm0, %ymm0, %k3
+; AVX256VL-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX256VL-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k3} {z}
+; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
+; AVX256VL-NEXT: kshiftrw $8, %k2, %k2
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k2} {z}
; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2
; AVX256VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
@@ -153,20 +148,15 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0
; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2
; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1]
; AVX256VL-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm1
-; AVX256VL-NEXT: vpmovsxwd %xmm2, %ymm1
-; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
-; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1
-; AVX256VL-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX256VL-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1
-; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k0
-; AVX256VL-NEXT: kunpckbw %k1, %k0, %k0
-; AVX256VL-NEXT: kshiftrw $8, %k0, %k2
-; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256VL-NEXT: vpmovsxwd %ymm2, %zmm1
+; AVX256VL-NEXT: vpslld $31, %zmm1, %zmm1
+; AVX256VL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
+; AVX256VL-NEXT: kshiftrw $8, %k1, %k1
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX256VL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256VL-NEXT: vpacksswb %xmm0, %xmm1, %xmm0
; AVX256VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX256VL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
index 59b03f8..c9e48f8 100644
--- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
+++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll
@@ -58,13 +58,12 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) {
define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) {
; AVX256BW-LABEL: test_mul_32i8:
; AVX256BW: # %bb.0:
-; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX256BW-NEXT: vpand %ymm2, %ymm1, %ymm3
-; AVX256BW-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
-; AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX256BW-NEXT: vpmullw %ymm1, %ymm0, %ymm2
+; AVX256BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX256BW-NEXT: vpandn %ymm1, %ymm3, %ymm1
; AVX256BW-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
; AVX256BW-NEXT: vpsllw $8, %ymm0, %ymm0
-; AVX256BW-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm2)
+; AVX256BW-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm3)
; AVX256BW-NEXT: retq
;
; AVX512BWVL-LABEL: test_mul_32i8:
diff --git a/llvm/test/CodeGen/X86/regalloc-fp.ll b/llvm/test/CodeGen/X86/regalloc-fp.ll
new file mode 100644
index 0000000..e89e5ab1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/regalloc-fp.ll
@@ -0,0 +1,775 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; Context:
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+define i32 @check_none() "frame-pointer"="none" {
+; CHECK-LABEL: check_none:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %reg0 = alloca i32, align 4
+ %reg1 = alloca i32, align 4
+ %reg2 = alloca i32, align 4
+ %reg3 = alloca i32, align 4
+ %reg4 = alloca i32, align 4
+ %reg5 = alloca i32, align 4
+ %reg6 = alloca i32, align 4
+ %reg7 = alloca i32, align 4
+ %reg8 = alloca i32, align 4
+ %reg9 = alloca i32, align 4
+ %reg10 = alloca i32, align 4
+ %reg11 = alloca i32, align 4
+ %reg12 = alloca i32, align 4
+ %reg13 = alloca i32, align 4
+ %reg14 = alloca i32, align 4
+ store volatile i32 0, ptr %reg0, align 4
+ store volatile i32 1, ptr %reg1, align 4
+ store volatile i32 2, ptr %reg2, align 4
+ store volatile i32 3, ptr %reg3, align 4
+ store volatile i32 4, ptr %reg4, align 4
+ store volatile i32 5, ptr %reg5, align 4
+ store volatile i32 6, ptr %reg6, align 4
+ store volatile i32 7, ptr %reg7, align 4
+ store volatile i32 8, ptr %reg8, align 4
+ store volatile i32 9, ptr %reg9, align 4
+ store volatile i32 16, ptr %reg10, align 4
+ store volatile i32 17, ptr %reg11, align 4
+ store volatile i32 18, ptr %reg12, align 4
+ store volatile i32 19, ptr %reg13, align 4
+ store volatile i32 20, ptr %reg14, align 4
+ %0 = load volatile i32, ptr %reg0, align 4
+ %1 = load volatile i32, ptr %reg1, align 4
+ %2 = load volatile i32, ptr %reg2, align 4
+ %3 = load volatile i32, ptr %reg3, align 4
+ %4 = load volatile i32, ptr %reg4, align 4
+ %5 = load volatile i32, ptr %reg5, align 4
+ %6 = load volatile i32, ptr %reg6, align 4
+ %7 = load volatile i32, ptr %reg7, align 4
+ %8 = load volatile i32, ptr %reg8, align 4
+ %9 = load volatile i32, ptr %reg9, align 4
+ %10 = load volatile i32, ptr %reg10, align 4
+ %11 = load volatile i32, ptr %reg11, align 4
+ %12 = load volatile i32, ptr %reg12, align 4
+ %13 = load volatile i32, ptr %reg13, align 4
+ %14 = load volatile i32, ptr %reg14, align 4
+ %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14) #1
+ %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0
+ %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1
+ %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2
+ %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3
+ %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 4
+ %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5
+ %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6
+ %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7
+ %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8
+ %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9
+ %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10
+ %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 11
+ %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12
+ %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13
+ %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14
+ store volatile i32 %asmresult, ptr %reg0, align 4
+ store volatile i32 %asmresult1, ptr %reg1, align 4
+ store volatile i32 %asmresult2, ptr %reg2, align 4
+ store volatile i32 %asmresult3, ptr %reg3, align 4
+ store volatile i32 %asmresult4, ptr %reg4, align 4
+ store volatile i32 %asmresult5, ptr %reg5, align 4
+ store volatile i32 %asmresult6, ptr %reg6, align 4
+ store volatile i32 %asmresult7, ptr %reg7, align 4
+ store volatile i32 %asmresult8, ptr %reg8, align 4
+ store volatile i32 %asmresult9, ptr %reg9, align 4
+ store volatile i32 %asmresult10, ptr %reg10, align 4
+ store volatile i32 %asmresult11, ptr %reg11, align 4
+ store volatile i32 %asmresult12, ptr %reg12, align 4
+ store volatile i32 %asmresult13, ptr %reg13, align 4
+ store volatile i32 %asmresult14, ptr %reg14, align 4
+ ret i32 0
+}
+
+define i32 @test_non_leaf_no_reserve() "frame-pointer"="non-leaf-no-reserve" {
+; CHECK-LABEL: test_non_leaf_no_reserve:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 56
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $20, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebp
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ebp, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %reg0 = alloca i32, align 4
+ %reg1 = alloca i32, align 4
+ %reg2 = alloca i32, align 4
+ %reg3 = alloca i32, align 4
+ %reg4 = alloca i32, align 4
+ %reg5 = alloca i32, align 4
+ %reg6 = alloca i32, align 4
+ %reg7 = alloca i32, align 4
+ %reg8 = alloca i32, align 4
+ %reg9 = alloca i32, align 4
+ %reg10 = alloca i32, align 4
+ %reg11 = alloca i32, align 4
+ %reg12 = alloca i32, align 4
+ %reg13 = alloca i32, align 4
+ %reg14 = alloca i32, align 4
+ store volatile i32 0, ptr %reg0, align 4
+ store volatile i32 1, ptr %reg1, align 4
+ store volatile i32 2, ptr %reg2, align 4
+ store volatile i32 3, ptr %reg3, align 4
+ store volatile i32 4, ptr %reg4, align 4
+ store volatile i32 5, ptr %reg5, align 4
+ store volatile i32 6, ptr %reg6, align 4
+ store volatile i32 7, ptr %reg7, align 4
+ store volatile i32 8, ptr %reg8, align 4
+ store volatile i32 9, ptr %reg9, align 4
+ store volatile i32 16, ptr %reg10, align 4
+ store volatile i32 17, ptr %reg11, align 4
+ store volatile i32 18, ptr %reg12, align 4
+ store volatile i32 19, ptr %reg13, align 4
+ store volatile i32 20, ptr %reg14, align 4
+ %0 = load volatile i32, ptr %reg0, align 4
+ %1 = load volatile i32, ptr %reg1, align 4
+ %2 = load volatile i32, ptr %reg2, align 4
+ %3 = load volatile i32, ptr %reg3, align 4
+ %4 = load volatile i32, ptr %reg4, align 4
+ %5 = load volatile i32, ptr %reg5, align 4
+ %6 = load volatile i32, ptr %reg6, align 4
+ %7 = load volatile i32, ptr %reg7, align 4
+ %8 = load volatile i32, ptr %reg8, align 4
+ %9 = load volatile i32, ptr %reg9, align 4
+ %10 = load volatile i32, ptr %reg10, align 4
+ %11 = load volatile i32, ptr %reg11, align 4
+ %12 = load volatile i32, ptr %reg12, align 4
+ %13 = load volatile i32, ptr %reg13, align 4
+ %14 = load volatile i32, ptr %reg14, align 4
+ %15 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, i32 %14) #1
+ %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 0
+ %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 1
+ %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 2
+ %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 3
+ %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 4
+ %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 5
+ %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 6
+ %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 7
+ %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 8
+ %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 9
+ %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 10
+ %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 11
+ %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 12
+ %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 13
+ %asmresult14 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %15, 14
+ store volatile i32 %asmresult, ptr %reg0, align 4
+ store volatile i32 %asmresult1, ptr %reg1, align 4
+ store volatile i32 %asmresult2, ptr %reg2, align 4
+ store volatile i32 %asmresult3, ptr %reg3, align 4
+ store volatile i32 %asmresult4, ptr %reg4, align 4
+ store volatile i32 %asmresult5, ptr %reg5, align 4
+ store volatile i32 %asmresult6, ptr %reg6, align 4
+ store volatile i32 %asmresult7, ptr %reg7, align 4
+ store volatile i32 %asmresult8, ptr %reg8, align 4
+ store volatile i32 %asmresult9, ptr %reg9, align 4
+ store volatile i32 %asmresult10, ptr %reg10, align 4
+ store volatile i32 %asmresult11, ptr %reg11, align 4
+ store volatile i32 %asmresult12, ptr %reg12, align 4
+ store volatile i32 %asmresult13, ptr %reg13, align 4
+ store volatile i32 %asmresult14, ptr %reg14, align 4
+ ret i32 0
+}
+
+define i32 @test_non_leaf() "frame-pointer"="non-leaf" {
+; CHECK-LABEL: test_non_leaf:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset %rbx, -48
+; CHECK-NEXT: .cfi_offset %r12, -40
+; CHECK-NEXT: .cfi_offset %r13, -32
+; CHECK-NEXT: .cfi_offset %r14, -24
+; CHECK-NEXT: .cfi_offset %r15, -16
+; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %reg0 = alloca i32, align 4
+ %reg1 = alloca i32, align 4
+ %reg2 = alloca i32, align 4
+ %reg3 = alloca i32, align 4
+ %reg4 = alloca i32, align 4
+ %reg5 = alloca i32, align 4
+ %reg6 = alloca i32, align 4
+ %reg7 = alloca i32, align 4
+ %reg8 = alloca i32, align 4
+ %reg9 = alloca i32, align 4
+ %reg10 = alloca i32, align 4
+ %reg11 = alloca i32, align 4
+ %reg12 = alloca i32, align 4
+ %reg13 = alloca i32, align 4
+ store volatile i32 0, ptr %reg0, align 4
+ store volatile i32 1, ptr %reg1, align 4
+ store volatile i32 2, ptr %reg2, align 4
+ store volatile i32 3, ptr %reg3, align 4
+ store volatile i32 4, ptr %reg4, align 4
+ store volatile i32 5, ptr %reg5, align 4
+ store volatile i32 6, ptr %reg6, align 4
+ store volatile i32 7, ptr %reg7, align 4
+ store volatile i32 8, ptr %reg8, align 4
+ store volatile i32 9, ptr %reg9, align 4
+ store volatile i32 16, ptr %reg10, align 4
+ store volatile i32 17, ptr %reg11, align 4
+ store volatile i32 18, ptr %reg12, align 4
+ store volatile i32 19, ptr %reg13, align 4
+ %0 = load volatile i32, ptr %reg0, align 4
+ %1 = load volatile i32, ptr %reg1, align 4
+ %2 = load volatile i32, ptr %reg2, align 4
+ %3 = load volatile i32, ptr %reg3, align 4
+ %4 = load volatile i32, ptr %reg4, align 4
+ %5 = load volatile i32, ptr %reg5, align 4
+ %6 = load volatile i32, ptr %reg6, align 4
+ %7 = load volatile i32, ptr %reg7, align 4
+ %8 = load volatile i32, ptr %reg8, align 4
+ %9 = load volatile i32, ptr %reg9, align 4
+ %10 = load volatile i32, ptr %reg10, align 4
+ %11 = load volatile i32, ptr %reg11, align 4
+ %12 = load volatile i32, ptr %reg12, align 4
+ %13 = load volatile i32, ptr %reg13, align 4
+ %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1
+ %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0
+ %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1
+ %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2
+ %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3
+ %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4
+ %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5
+ %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6
+ %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7
+ %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8
+ %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9
+ %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10
+ %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11
+ %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12
+ %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13
+ store volatile i32 %asmresult, ptr %reg0, align 4
+ store volatile i32 %asmresult1, ptr %reg1, align 4
+ store volatile i32 %asmresult2, ptr %reg2, align 4
+ store volatile i32 %asmresult3, ptr %reg3, align 4
+ store volatile i32 %asmresult4, ptr %reg4, align 4
+ store volatile i32 %asmresult5, ptr %reg5, align 4
+ store volatile i32 %asmresult6, ptr %reg6, align 4
+ store volatile i32 %asmresult7, ptr %reg7, align 4
+ store volatile i32 %asmresult8, ptr %reg8, align 4
+ store volatile i32 %asmresult9, ptr %reg9, align 4
+ store volatile i32 %asmresult10, ptr %reg10, align 4
+ store volatile i32 %asmresult11, ptr %reg11, align 4
+ store volatile i32 %asmresult12, ptr %reg12, align 4
+ store volatile i32 %asmresult13, ptr %reg13, align 4
+ ret i32 0
+}
+
+define i32 @test_reserved() "frame-pointer"="reserved" {
+; CHECK-LABEL: test_reserved:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 48
+; CHECK-NEXT: .cfi_offset %rbx, -48
+; CHECK-NEXT: .cfi_offset %r12, -40
+; CHECK-NEXT: .cfi_offset %r13, -32
+; CHECK-NEXT: .cfi_offset %r14, -24
+; CHECK-NEXT: .cfi_offset %r15, -16
+; CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $2, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $3, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $4, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $5, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $6, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $7, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $9, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $16, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $17, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $18, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl $19, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %edi
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r8d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r9d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r11d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %ebx
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r14d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r15d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r12d
+; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %r13d
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ecx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %edx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %esi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %edi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r8d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r9d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r10d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r11d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %ebx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r14d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r15d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r12d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movl %r13d, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 40
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: .cfi_def_cfa_offset 24
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: .cfi_def_cfa_offset 8
+; CHECK-NEXT: retq
+entry:
+ %reg0 = alloca i32, align 4
+ %reg1 = alloca i32, align 4
+ %reg2 = alloca i32, align 4
+ %reg3 = alloca i32, align 4
+ %reg4 = alloca i32, align 4
+ %reg5 = alloca i32, align 4
+ %reg6 = alloca i32, align 4
+ %reg7 = alloca i32, align 4
+ %reg8 = alloca i32, align 4
+ %reg9 = alloca i32, align 4
+ %reg10 = alloca i32, align 4
+ %reg11 = alloca i32, align 4
+ %reg12 = alloca i32, align 4
+ %reg13 = alloca i32, align 4
+ store volatile i32 0, ptr %reg0, align 4
+ store volatile i32 1, ptr %reg1, align 4
+ store volatile i32 2, ptr %reg2, align 4
+ store volatile i32 3, ptr %reg3, align 4
+ store volatile i32 4, ptr %reg4, align 4
+ store volatile i32 5, ptr %reg5, align 4
+ store volatile i32 6, ptr %reg6, align 4
+ store volatile i32 7, ptr %reg7, align 4
+ store volatile i32 8, ptr %reg8, align 4
+ store volatile i32 9, ptr %reg9, align 4
+ store volatile i32 16, ptr %reg10, align 4
+ store volatile i32 17, ptr %reg11, align 4
+ store volatile i32 18, ptr %reg12, align 4
+ store volatile i32 19, ptr %reg13, align 4
+ %0 = load volatile i32, ptr %reg0, align 4
+ %1 = load volatile i32, ptr %reg1, align 4
+ %2 = load volatile i32, ptr %reg2, align 4
+ %3 = load volatile i32, ptr %reg3, align 4
+ %4 = load volatile i32, ptr %reg4, align 4
+ %5 = load volatile i32, ptr %reg5, align 4
+ %6 = load volatile i32, ptr %reg6, align 4
+ %7 = load volatile i32, ptr %reg7, align 4
+ %8 = load volatile i32, ptr %reg8, align 4
+ %9 = load volatile i32, ptr %reg9, align 4
+ %10 = load volatile i32, ptr %reg10, align 4
+ %11 = load volatile i32, ptr %reg11, align 4
+ %12 = load volatile i32, ptr %reg12, align 4
+ %13 = load volatile i32, ptr %reg13, align 4
+ %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1
+ %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0
+ %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1
+ %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2
+ %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3
+ %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4
+ %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5
+ %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6
+ %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7
+ %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8
+ %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9
+ %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10
+ %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11
+ %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12
+ %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13
+ store volatile i32 %asmresult, ptr %reg0, align 4
+ store volatile i32 %asmresult1, ptr %reg1, align 4
+ store volatile i32 %asmresult2, ptr %reg2, align 4
+ store volatile i32 %asmresult3, ptr %reg3, align 4
+ store volatile i32 %asmresult4, ptr %reg4, align 4
+ store volatile i32 %asmresult5, ptr %reg5, align 4
+ store volatile i32 %asmresult6, ptr %reg6, align 4
+ store volatile i32 %asmresult7, ptr %reg7, align 4
+ store volatile i32 %asmresult8, ptr %reg8, align 4
+ store volatile i32 %asmresult9, ptr %reg9, align 4
+ store volatile i32 %asmresult10, ptr %reg10, align 4
+ store volatile i32 %asmresult11, ptr %reg11, align 4
+ store volatile i32 %asmresult12, ptr %reg12, align 4
+ store volatile i32 %asmresult13, ptr %reg13, align 4
+ ret i32 0
+}
+
+define i32 @test_all() "frame-pointer"="all" {
+; CHECK-LABEL: test_all:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbp, -16
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: .cfi_def_cfa_register %rbp
+; CHECK-NEXT: pushq %r15
+; CHECK-NEXT: pushq %r14
+; CHECK-NEXT: pushq %r13
+; CHECK-NEXT: pushq %r12
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_offset %rbx, -56
+; CHECK-NEXT: .cfi_offset %r12, -48
+; CHECK-NEXT: .cfi_offset %r13, -40
+; CHECK-NEXT: .cfi_offset %r14, -32
+; CHECK-NEXT: .cfi_offset %r15, -24
+; CHECK-NEXT: movl $0, -96(%rbp)
+; CHECK-NEXT: movl $1, -92(%rbp)
+; CHECK-NEXT: movl $2, -88(%rbp)
+; CHECK-NEXT: movl $3, -84(%rbp)
+; CHECK-NEXT: movl $4, -80(%rbp)
+; CHECK-NEXT: movl $5, -76(%rbp)
+; CHECK-NEXT: movl $6, -72(%rbp)
+; CHECK-NEXT: movl $7, -68(%rbp)
+; CHECK-NEXT: movl $8, -64(%rbp)
+; CHECK-NEXT: movl $9, -60(%rbp)
+; CHECK-NEXT: movl $16, -56(%rbp)
+; CHECK-NEXT: movl $17, -52(%rbp)
+; CHECK-NEXT: movl $18, -48(%rbp)
+; CHECK-NEXT: movl $19, -44(%rbp)
+; CHECK-NEXT: movl -96(%rbp), %eax
+; CHECK-NEXT: movl -92(%rbp), %ecx
+; CHECK-NEXT: movl -88(%rbp), %edx
+; CHECK-NEXT: movl -84(%rbp), %esi
+; CHECK-NEXT: movl -80(%rbp), %edi
+; CHECK-NEXT: movl -76(%rbp), %r8d
+; CHECK-NEXT: movl -72(%rbp), %r9d
+; CHECK-NEXT: movl -68(%rbp), %r10d
+; CHECK-NEXT: movl -64(%rbp), %r11d
+; CHECK-NEXT: movl -60(%rbp), %ebx
+; CHECK-NEXT: movl -56(%rbp), %r14d
+; CHECK-NEXT: movl -52(%rbp), %r15d
+; CHECK-NEXT: movl -48(%rbp), %r12d
+; CHECK-NEXT: movl -44(%rbp), %r13d
+; CHECK-NEXT: #APP
+; CHECK-NEXT: nop
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: movl %eax, -96(%rbp)
+; CHECK-NEXT: movl %ecx, -92(%rbp)
+; CHECK-NEXT: movl %edx, -88(%rbp)
+; CHECK-NEXT: movl %esi, -84(%rbp)
+; CHECK-NEXT: movl %edi, -80(%rbp)
+; CHECK-NEXT: movl %r8d, -76(%rbp)
+; CHECK-NEXT: movl %r9d, -72(%rbp)
+; CHECK-NEXT: movl %r10d, -68(%rbp)
+; CHECK-NEXT: movl %r11d, -64(%rbp)
+; CHECK-NEXT: movl %ebx, -60(%rbp)
+; CHECK-NEXT: movl %r14d, -56(%rbp)
+; CHECK-NEXT: movl %r15d, -52(%rbp)
+; CHECK-NEXT: movl %r12d, -48(%rbp)
+; CHECK-NEXT: movl %r13d, -44(%rbp)
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %r12
+; CHECK-NEXT: popq %r13
+; CHECK-NEXT: popq %r14
+; CHECK-NEXT: popq %r15
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: .cfi_def_cfa %rsp, 8
+; CHECK-NEXT: retq
+entry:
+ %reg0 = alloca i32, align 4
+ %reg1 = alloca i32, align 4
+ %reg2 = alloca i32, align 4
+ %reg3 = alloca i32, align 4
+ %reg4 = alloca i32, align 4
+ %reg5 = alloca i32, align 4
+ %reg6 = alloca i32, align 4
+ %reg7 = alloca i32, align 4
+ %reg8 = alloca i32, align 4
+ %reg9 = alloca i32, align 4
+ %reg10 = alloca i32, align 4
+ %reg11 = alloca i32, align 4
+ %reg12 = alloca i32, align 4
+ %reg13 = alloca i32, align 4
+ store volatile i32 0, ptr %reg0, align 4
+ store volatile i32 1, ptr %reg1, align 4
+ store volatile i32 2, ptr %reg2, align 4
+ store volatile i32 3, ptr %reg3, align 4
+ store volatile i32 4, ptr %reg4, align 4
+ store volatile i32 5, ptr %reg5, align 4
+ store volatile i32 6, ptr %reg6, align 4
+ store volatile i32 7, ptr %reg7, align 4
+ store volatile i32 8, ptr %reg8, align 4
+ store volatile i32 9, ptr %reg9, align 4
+ store volatile i32 16, ptr %reg10, align 4
+ store volatile i32 17, ptr %reg11, align 4
+ store volatile i32 18, ptr %reg12, align 4
+ store volatile i32 19, ptr %reg13, align 4
+ %0 = load volatile i32, ptr %reg0, align 4
+ %1 = load volatile i32, ptr %reg1, align 4
+ %2 = load volatile i32, ptr %reg2, align 4
+ %3 = load volatile i32, ptr %reg3, align 4
+ %4 = load volatile i32, ptr %reg4, align 4
+ %5 = load volatile i32, ptr %reg5, align 4
+ %6 = load volatile i32, ptr %reg6, align 4
+ %7 = load volatile i32, ptr %reg7, align 4
+ %8 = load volatile i32, ptr %reg8, align 4
+ %9 = load volatile i32, ptr %reg9, align 4
+ %10 = load volatile i32, ptr %reg10, align 4
+ %11 = load volatile i32, ptr %reg11, align 4
+ %12 = load volatile i32, ptr %reg12, align 4
+ %13 = load volatile i32, ptr %reg13, align 4
+ %14 = call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "nop", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,~{dirflag},~{fpsr},~{flags}"(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13) #1
+ %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 0
+ %asmresult1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 1
+ %asmresult2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 2
+ %asmresult3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 3
+ %asmresult4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 4
+ %asmresult5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 5
+ %asmresult6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 6
+ %asmresult7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 7
+ %asmresult8 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 8
+ %asmresult9 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 9
+ %asmresult10 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 10
+ %asmresult11 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 11
+ %asmresult12 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 12
+ %asmresult13 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %14, 13
+ store volatile i32 %asmresult, ptr %reg0, align 4
+ store volatile i32 %asmresult1, ptr %reg1, align 4
+ store volatile i32 %asmresult2, ptr %reg2, align 4
+ store volatile i32 %asmresult3, ptr %reg3, align 4
+ store volatile i32 %asmresult4, ptr %reg4, align 4
+ store volatile i32 %asmresult5, ptr %reg5, align 4
+ store volatile i32 %asmresult6, ptr %reg6, align 4
+ store volatile i32 %asmresult7, ptr %reg7, align 4
+ store volatile i32 %asmresult8, ptr %reg8, align 4
+ store volatile i32 %asmresult9, ptr %reg9, align 4
+ store volatile i32 %asmresult10, ptr %reg10, align 4
+ store volatile i32 %asmresult11, ptr %reg11, align 4
+ store volatile i32 %asmresult12, ptr %reg12, align 4
+ store volatile i32 %asmresult13, ptr %reg13, align 4
+ ret i32 0
+}
diff --git a/llvm/test/CodeGen/X86/rotate-extract.ll b/llvm/test/CodeGen/X86/rotate-extract.ll
index 8f046a4..26e6886 100644
--- a/llvm/test/CodeGen/X86/rotate-extract.ll
+++ b/llvm/test/CodeGen/X86/rotate-extract.ll
@@ -203,10 +203,10 @@ define i16 @no_extract_mul(i16 %i) nounwind {
; X64-LABEL: no_extract_mul:
; X64: # %bb.0:
; X64-NEXT: # kill: def $edi killed $edi def $rdi
-; X64-NEXT: leal (%rdi,%rdi,8), %eax
-; X64-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
+; X64-NEXT: movl %edi, %eax
; X64-NEXT: shll $8, %edi
; X64-NEXT: leal (%rdi,%rdi,8), %ecx
+; X64-NEXT: leal (%rax,%rax,8), %eax
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: shrl $9, %eax
; X64-NEXT: orl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/rounding-ops.ll b/llvm/test/CodeGen/X86/rounding-ops.ll
index 948449c..147663a 100644
--- a/llvm/test/CodeGen/X86/rounding-ops.ll
+++ b/llvm/test/CodeGen/X86/rounding-ops.ll
@@ -60,12 +60,10 @@ define float @test3(float %x) nounwind {
; CHECK-AVX512: ## %bb.0:
; CHECK-AVX512-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
; CHECK-AVX512-NEXT: retq
- %call = tail call float @nearbyintf(float %x) nounwind readnone
+ %call = tail call float @llvm.nearbyint.f32(float %x) nounwind readnone
ret float %call
}
-declare float @nearbyintf(float) nounwind readnone
-
define double @test4(double %x) nounwind {
; CHECK-SSE-LABEL: test4:
; CHECK-SSE: ## %bb.0:
@@ -81,12 +79,10 @@ define double @test4(double %x) nounwind {
; CHECK-AVX512: ## %bb.0:
; CHECK-AVX512-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0
; CHECK-AVX512-NEXT: retq
- %call = tail call double @nearbyint(double %x) nounwind readnone
+ %call = tail call double @llvm.nearbyint.f64(double %x) nounwind readnone
ret double %call
}
-declare double @nearbyint(double) nounwind readnone
-
define float @test5(float %x) nounwind {
; CHECK-SSE-LABEL: test5:
; CHECK-SSE: ## %bb.0:
diff --git a/llvm/test/CodeGen/X86/scatter-schedule.ll b/llvm/test/CodeGen/X86/scatter-schedule.ll
index 762a050..36bf313 100644
--- a/llvm/test/CodeGen/X86/scatter-schedule.ll
+++ b/llvm/test/CodeGen/X86/scatter-schedule.ll
@@ -9,9 +9,9 @@ target triple = "x86_64-unknown-linux-gnu"
define void @test(i64 %x272, <16 x ptr> %x335, <16 x i32> %x270) {
; CHECK-LABEL: test:
; CHECK: # %bb.0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1}
-; CHECK-NEXT: kxnorw %k0, %k0, %k1
+; CHECK-NEXT: kxnorb %k0, %k0, %k1
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm0
; CHECK-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; CHECK-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index d018c53..23c3e84 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=ANY --check-prefix=NO512 --check-prefix=AVXANY --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ANY --check-prefix=AVXANY --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,NO512,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=CHECK,NO512,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=CHECK,NO512,AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,NO512,AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=CHECK,AVX,AVX512,AVX512BW
; Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization.
@@ -26,13 +26,13 @@ define i32 @ne_i128(<2 x i64> %x, <2 x i64> %y) {
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
-; AVXANY-LABEL: ne_i128:
-; AVXANY: # %bb.0:
-; AVXANY-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVXANY-NEXT: xorl %eax, %eax
-; AVXANY-NEXT: vptest %xmm0, %xmm0
-; AVXANY-NEXT: setne %al
-; AVXANY-NEXT: retq
+; AVX-LABEL: ne_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %xmm0, %xmm0
+; AVX-NEXT: setne %al
+; AVX-NEXT: retq
%bcx = bitcast <2 x i64> %x to i128
%bcy = bitcast <2 x i64> %y to i128
%cmp = icmp ne i128 %bcx, %bcy
@@ -58,13 +58,13 @@ define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) {
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
-; AVXANY-LABEL: eq_i128:
-; AVXANY: # %bb.0:
-; AVXANY-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVXANY-NEXT: xorl %eax, %eax
-; AVXANY-NEXT: vptest %xmm0, %xmm0
-; AVXANY-NEXT: sete %al
-; AVXANY-NEXT: retq
+; AVX-LABEL: eq_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %xmm0, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retq
%bcx = bitcast <2 x i64> %x to i128
%bcy = bitcast <2 x i64> %y to i128
%cmp = icmp eq i128 %bcx, %bcy
@@ -722,39 +722,27 @@ define i1 @ne_v4i256(<4 x i256> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovd %eax, %xmm0
-; AVX512-NEXT: shrq $32, %rax
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: vpinsrd $2, %r10d, %xmm0, %xmm0
-; AVX512-NEXT: shrq $32, %r10
-; AVX512-NEXT: vpinsrd $3, %r10d, %xmm0, %xmm0
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT: vmovd %r8d, %xmm1
-; AVX512-NEXT: shrq $32, %r8
-; AVX512-NEXT: vpinsrd $1, %r8d, %xmm1, %xmm1
+; AVX512-NEXT: vmovq %r10, %xmm0
+; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r9
-; AVX512-NEXT: vpinsrd $2, %r9d, %xmm1, %xmm1
-; AVX512-NEXT: shrq $32, %r9
-; AVX512-NEXT: vpinsrd $3, %r9d, %xmm1, %xmm1
+; AVX512-NEXT: vmovq %r9, %xmm1
+; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %r8
+; AVX512-NEXT: vmovq %r8, %xmm2
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: vmovd %edx, %xmm1
-; AVX512-NEXT: shrq $32, %rdx
-; AVX512-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1
; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
-; AVX512-NEXT: shrq $32, %rcx
-; AVX512-NEXT: vpinsrd $3, %ecx, %xmm1, %xmm1
-; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: vmovd %edi, %xmm2
-; AVX512-NEXT: shrq $32, %rdi
-; AVX512-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2
+; AVX512-NEXT: vmovq %rcx, %xmm1
+; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm2
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: vpinsrd $2, %esi, %xmm2, %xmm2
-; AVX512-NEXT: shrq $32, %rsi
-; AVX512-NEXT: vpinsrd $3, %esi, %xmm2, %xmm2
+; AVX512-NEXT: vmovq %rsi, %xmm2
+; AVX512-NEXT: orq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT: vmovq %rdi, %xmm3
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -797,17 +785,17 @@ define i32 @ne_i128_pair(ptr %a, ptr %b) {
; SSE41-NEXT: setne %al
; SSE41-NEXT: retq
;
-; AVXANY-LABEL: ne_i128_pair:
-; AVXANY: # %bb.0:
-; AVXANY-NEXT: vmovdqa (%rdi), %xmm0
-; AVXANY-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVXANY-NEXT: vpxor 16(%rsi), %xmm1, %xmm1
-; AVXANY-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; AVXANY-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVXANY-NEXT: xorl %eax, %eax
-; AVXANY-NEXT: vptest %xmm0, %xmm0
-; AVXANY-NEXT: setne %al
-; AVXANY-NEXT: retq
+; AVX-LABEL: ne_i128_pair:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vpxor 16(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %xmm0, %xmm0
+; AVX-NEXT: setne %al
+; AVX-NEXT: retq
%a0 = load i128, ptr %a
%b0 = load i128, ptr %b
%xor1 = xor i128 %a0, %b0
@@ -851,17 +839,17 @@ define i32 @eq_i128_pair(ptr %a, ptr %b) {
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
-; AVXANY-LABEL: eq_i128_pair:
-; AVXANY: # %bb.0:
-; AVXANY-NEXT: vmovdqa (%rdi), %xmm0
-; AVXANY-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVXANY-NEXT: vpxor 16(%rsi), %xmm1, %xmm1
-; AVXANY-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; AVXANY-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVXANY-NEXT: xorl %eax, %eax
-; AVXANY-NEXT: vptest %xmm0, %xmm0
-; AVXANY-NEXT: sete %al
-; AVXANY-NEXT: retq
+; AVX-LABEL: eq_i128_pair:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vpxor 16(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: vptest %xmm0, %xmm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: retq
%a0 = load i128, ptr %a
%b0 = load i128, ptr %b
%xor1 = xor i128 %a0, %b0
@@ -1236,90 +1224,90 @@ define i32 @eq_i512_pair(ptr %a, ptr %b) {
; PR41971: Comparison using vector types is not favorable here.
define i1 @eq_i128_args(i128 %a, i128 %b) {
-; ANY-LABEL: eq_i128_args:
-; ANY: # %bb.0:
-; ANY-NEXT: xorq %rcx, %rsi
-; ANY-NEXT: xorq %rdx, %rdi
-; ANY-NEXT: orq %rsi, %rdi
-; ANY-NEXT: sete %al
-; ANY-NEXT: retq
+; CHECK-LABEL: eq_i128_args:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorq %rcx, %rsi
+; CHECK-NEXT: xorq %rdx, %rdi
+; CHECK-NEXT: orq %rsi, %rdi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%r = icmp eq i128 %a, %b
ret i1 %r
}
define i1 @eq_i256_args(i256 %a, i256 %b) {
-; ANY-LABEL: eq_i256_args:
-; ANY: # %bb.0:
-; ANY-NEXT: xorq %r9, %rsi
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
-; ANY-NEXT: orq %rsi, %rcx
-; ANY-NEXT: xorq %r8, %rdi
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
-; ANY-NEXT: orq %rdi, %rdx
-; ANY-NEXT: orq %rcx, %rdx
-; ANY-NEXT: sete %al
-; ANY-NEXT: retq
+; CHECK-LABEL: eq_i256_args:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorq %r9, %rsi
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: orq %rsi, %rcx
+; CHECK-NEXT: xorq %r8, %rdi
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: orq %rdi, %rdx
+; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%r = icmp eq i256 %a, %b
ret i1 %r
}
define i1 @eq_i512_args(i512 %a, i512 %b) {
-; ANY-LABEL: eq_i512_args:
-; ANY: # %bb.0:
-; ANY-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; ANY-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
-; ANY-NEXT: orq %r10, %rcx
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi
-; ANY-NEXT: orq %r9, %rsi
-; ANY-NEXT: orq %rcx, %rsi
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
-; ANY-NEXT: orq %rax, %rdx
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi
-; ANY-NEXT: orq %r8, %rdi
-; ANY-NEXT: orq %rdx, %rdi
-; ANY-NEXT: orq %rsi, %rdi
-; ANY-NEXT: sete %al
-; ANY-NEXT: retq
+; CHECK-LABEL: eq_i512_args:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: orq %r10, %rcx
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r9
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: orq %r9, %rsi
+; CHECK-NEXT: orq %rcx, %rsi
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: orq %rax, %rdx
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r8
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: orq %r8, %rdi
+; CHECK-NEXT: orq %rdx, %rdi
+; CHECK-NEXT: orq %rsi, %rdi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%r = icmp eq i512 %a, %b
ret i1 %r
}
define i1 @eq_i128_op(i128 %a, i128 %b) {
-; ANY-LABEL: eq_i128_op:
-; ANY: # %bb.0:
-; ANY-NEXT: addq $1, %rdi
-; ANY-NEXT: adcq $0, %rsi
-; ANY-NEXT: xorq %rdx, %rdi
-; ANY-NEXT: xorq %rcx, %rsi
-; ANY-NEXT: orq %rdi, %rsi
-; ANY-NEXT: sete %al
-; ANY-NEXT: retq
+; CHECK-LABEL: eq_i128_op:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addq $1, %rdi
+; CHECK-NEXT: adcq $0, %rsi
+; CHECK-NEXT: xorq %rdx, %rdi
+; CHECK-NEXT: xorq %rcx, %rsi
+; CHECK-NEXT: orq %rdi, %rsi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%a2 = add i128 %a, 1
%r = icmp eq i128 %a2, %b
ret i1 %r
}
define i1 @eq_i256_op(i256 %a, i256 %b) {
-; ANY-LABEL: eq_i256_op:
-; ANY: # %bb.0:
-; ANY-NEXT: addq $1, %rdi
-; ANY-NEXT: adcq $0, %rsi
-; ANY-NEXT: adcq $0, %rdx
-; ANY-NEXT: adcq $0, %rcx
-; ANY-NEXT: xorq %r8, %rdi
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
-; ANY-NEXT: orq %rdi, %rdx
-; ANY-NEXT: xorq %r9, %rsi
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
-; ANY-NEXT: orq %rsi, %rcx
-; ANY-NEXT: orq %rdx, %rcx
-; ANY-NEXT: sete %al
-; ANY-NEXT: retq
+; CHECK-LABEL: eq_i256_op:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addq $1, %rdi
+; CHECK-NEXT: adcq $0, %rsi
+; CHECK-NEXT: adcq $0, %rdx
+; CHECK-NEXT: adcq $0, %rcx
+; CHECK-NEXT: xorq %r8, %rdi
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: orq %rdi, %rdx
+; CHECK-NEXT: xorq %r9, %rsi
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: orq %rsi, %rcx
+; CHECK-NEXT: orq %rdx, %rcx
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%a2 = add i256 %a, 1
%r = icmp eq i256 %a2, %b
ret i1 %r
@@ -1356,93 +1344,93 @@ define i1 @eq_i512_op(i512 %a, i512 %b) {
; SSE-NEXT: sete %al
; SSE-NEXT: retq
;
-; AVXANY-LABEL: eq_i512_op:
-; AVXANY: # %bb.0:
-; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVXANY-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVXANY-NEXT: addq $1, %rdi
-; AVXANY-NEXT: adcq $0, %rsi
-; AVXANY-NEXT: adcq $0, %rdx
-; AVXANY-NEXT: adcq $0, %rcx
-; AVXANY-NEXT: adcq $0, %r8
-; AVXANY-NEXT: adcq $0, %r9
-; AVXANY-NEXT: adcq $0, %r10
-; AVXANY-NEXT: adcq $0, %rax
-; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rsi
-; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r9
-; AVXANY-NEXT: orq %rsi, %r9
-; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
-; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax
-; AVXANY-NEXT: orq %rcx, %rax
-; AVXANY-NEXT: orq %r9, %rax
-; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
-; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10
-; AVXANY-NEXT: orq %rdx, %r10
-; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %r8
-; AVXANY-NEXT: xorq {{[0-9]+}}(%rsp), %rdi
-; AVXANY-NEXT: orq %r8, %rdi
-; AVXANY-NEXT: orq %r10, %rdi
-; AVXANY-NEXT: orq %rax, %rdi
-; AVXANY-NEXT: sete %al
-; AVXANY-NEXT: retq
+; AVX-LABEL: eq_i512_op:
+; AVX: # %bb.0:
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: addq $1, %rdi
+; AVX-NEXT: adcq $0, %rsi
+; AVX-NEXT: adcq $0, %rdx
+; AVX-NEXT: adcq $0, %rcx
+; AVX-NEXT: adcq $0, %r8
+; AVX-NEXT: adcq $0, %r9
+; AVX-NEXT: adcq $0, %r10
+; AVX-NEXT: adcq $0, %rax
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rsi
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %r9
+; AVX-NEXT: orq %rsi, %r9
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: orq %rcx, %rax
+; AVX-NEXT: orq %r9, %rax
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %r10
+; AVX-NEXT: orq %rdx, %r10
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %r8
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rdi
+; AVX-NEXT: orq %r8, %rdi
+; AVX-NEXT: orq %r10, %rdi
+; AVX-NEXT: orq %rax, %rdi
+; AVX-NEXT: sete %al
+; AVX-NEXT: retq
%a2 = add i512 %a, 1
%r = icmp eq i512 %a2, %b
ret i1 %r
}
define i1 @eq_i128_load_arg(ptr%p, i128 %b) {
-; ANY-LABEL: eq_i128_load_arg:
-; ANY: # %bb.0:
-; ANY-NEXT: xorq 8(%rdi), %rdx
-; ANY-NEXT: xorq (%rdi), %rsi
-; ANY-NEXT: orq %rdx, %rsi
-; ANY-NEXT: sete %al
-; ANY-NEXT: retq
+; CHECK-LABEL: eq_i128_load_arg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorq 8(%rdi), %rdx
+; CHECK-NEXT: xorq (%rdi), %rsi
+; CHECK-NEXT: orq %rdx, %rsi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%a = load i128, ptr %p
%r = icmp eq i128 %a, %b
ret i1 %r
}
define i1 @eq_i256_load_arg(ptr%p, i256 %b) {
-; ANY-LABEL: eq_i256_load_arg:
-; ANY: # %bb.0:
-; ANY-NEXT: xorq 24(%rdi), %r8
-; ANY-NEXT: xorq 8(%rdi), %rdx
-; ANY-NEXT: orq %r8, %rdx
-; ANY-NEXT: xorq 16(%rdi), %rcx
-; ANY-NEXT: xorq (%rdi), %rsi
-; ANY-NEXT: orq %rcx, %rsi
-; ANY-NEXT: orq %rdx, %rsi
-; ANY-NEXT: sete %al
-; ANY-NEXT: retq
+; CHECK-LABEL: eq_i256_load_arg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xorq 24(%rdi), %r8
+; CHECK-NEXT: xorq 8(%rdi), %rdx
+; CHECK-NEXT: orq %r8, %rdx
+; CHECK-NEXT: xorq 16(%rdi), %rcx
+; CHECK-NEXT: xorq (%rdi), %rsi
+; CHECK-NEXT: orq %rcx, %rsi
+; CHECK-NEXT: orq %rdx, %rsi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%a = load i256, ptr %p
%r = icmp eq i256 %a, %b
ret i1 %r
}
define i1 @eq_i512_load_arg(ptr%p, i512 %b) {
-; ANY-LABEL: eq_i512_load_arg:
-; ANY: # %bb.0:
-; ANY-NEXT: movq 40(%rdi), %rax
-; ANY-NEXT: movq 48(%rdi), %r10
-; ANY-NEXT: movq 56(%rdi), %r11
-; ANY-NEXT: xorq 24(%rdi), %r8
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r11
-; ANY-NEXT: orq %r8, %r11
-; ANY-NEXT: xorq 8(%rdi), %rdx
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %rax
-; ANY-NEXT: orq %rdx, %rax
-; ANY-NEXT: orq %r11, %rax
-; ANY-NEXT: xorq 32(%rdi), %r9
-; ANY-NEXT: xorq (%rdi), %rsi
-; ANY-NEXT: orq %r9, %rsi
-; ANY-NEXT: xorq 16(%rdi), %rcx
-; ANY-NEXT: xorq {{[0-9]+}}(%rsp), %r10
-; ANY-NEXT: orq %rcx, %r10
-; ANY-NEXT: orq %rsi, %r10
-; ANY-NEXT: orq %rax, %r10
-; ANY-NEXT: sete %al
-; ANY-NEXT: retq
+; CHECK-LABEL: eq_i512_load_arg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq 40(%rdi), %rax
+; CHECK-NEXT: movq 48(%rdi), %r10
+; CHECK-NEXT: movq 56(%rdi), %r11
+; CHECK-NEXT: xorq 24(%rdi), %r8
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r11
+; CHECK-NEXT: orq %r8, %r11
+; CHECK-NEXT: xorq 8(%rdi), %rdx
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: orq %rdx, %rax
+; CHECK-NEXT: orq %r11, %rax
+; CHECK-NEXT: xorq 32(%rdi), %r9
+; CHECK-NEXT: xorq (%rdi), %rsi
+; CHECK-NEXT: orq %r9, %rsi
+; CHECK-NEXT: xorq 16(%rdi), %rcx
+; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: orq %rcx, %r10
+; CHECK-NEXT: orq %rsi, %r10
+; CHECK-NEXT: orq %rax, %r10
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: retq
%a = load i512, ptr %p
%r = icmp eq i512 %a, %b
ret i1 %r
@@ -1451,12 +1439,12 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) {
; Tests for any/allbits from memory.
define i1 @anybits_i128_load_arg(ptr %w) {
-; ANY-LABEL: anybits_i128_load_arg:
-; ANY: # %bb.0:
-; ANY-NEXT: movq (%rdi), %rax
-; ANY-NEXT: orq 8(%rdi), %rax
-; ANY-NEXT: setne %al
-; ANY-NEXT: retq
+; CHECK-LABEL: anybits_i128_load_arg:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq (%rdi), %rax
+; CHECK-NEXT: orq 8(%rdi), %rax
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: retq
%ld = load i128, ptr %w
%cmp = icmp ne i128 %ld, 0
ret i1 %cmp
@@ -1480,13 +1468,13 @@ define i1 @allbits_i128_load_arg(ptr %w) {
; SSE41-NEXT: setb %al
; SSE41-NEXT: retq
;
-; AVXANY-LABEL: allbits_i128_load_arg:
-; AVXANY: # %bb.0:
-; AVXANY-NEXT: vmovdqa (%rdi), %xmm0
-; AVXANY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVXANY-NEXT: vptest %xmm1, %xmm0
-; AVXANY-NEXT: setb %al
-; AVXANY-NEXT: retq
+; AVX-LABEL: allbits_i128_load_arg:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vptest %xmm1, %xmm0
+; AVX-NEXT: setb %al
+; AVX-NEXT: retq
%ld = load i128, ptr %w
%cmp = icmp eq i128 %ld, -1
ret i1 %cmp
@@ -1503,13 +1491,13 @@ define i1 @anybits_i256_load_arg(ptr %w) {
; SSE-NEXT: setne %al
; SSE-NEXT: retq
;
-; AVXANY-LABEL: anybits_i256_load_arg:
-; AVXANY: # %bb.0:
-; AVXANY-NEXT: vmovdqu (%rdi), %ymm0
-; AVXANY-NEXT: vptest %ymm0, %ymm0
-; AVXANY-NEXT: setne %al
-; AVXANY-NEXT: vzeroupper
-; AVXANY-NEXT: retq
+; AVX-LABEL: anybits_i256_load_arg:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqu (%rdi), %ymm0
+; AVX-NEXT: vptest %ymm0, %ymm0
+; AVX-NEXT: setne %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%ld = load i256, ptr %w
%cmp = icmp ne i256 %ld, 0
ret i1 %cmp
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index 03b61d9..4d341f1 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -1,208 +1,2050 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s -check-prefixes=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+avx512vbmi2 | FileCheck %s -check-prefixes=AVX512VBMI
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s -check-prefixes=ZNVER4
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s -check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s -check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s -check-prefixes=CHECK,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=knl | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 -mattr=+avx512vbmi2 | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512VBMI
-; i512 shifts hidden inside 512-bit vectors.
+define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
+; SSE-LABEL: shl_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: andl $56, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: cltq
+; SSE-NEXT: movq -56(%rsp,%rax), %rdx
+; SSE-NEXT: movq -48(%rsp,%rax), %r9
+; SSE-NEXT: movq %r9, %rsi
+; SSE-NEXT: shldq %cl, %rdx, %rsi
+; SSE-NEXT: movq -40(%rsp,%rax), %r10
+; SSE-NEXT: movq %r10, %r8
+; SSE-NEXT: shldq %cl, %r9, %r8
+; SSE-NEXT: movq -32(%rsp,%rax), %r9
+; SSE-NEXT: movq %r9, %r11
+; SSE-NEXT: shldq %cl, %r10, %r11
+; SSE-NEXT: movq -24(%rsp,%rax), %r10
+; SSE-NEXT: movq %r10, %rbx
+; SSE-NEXT: shldq %cl, %r9, %rbx
+; SSE-NEXT: movq -16(%rsp,%rax), %r9
+; SSE-NEXT: movq %r9, %r14
+; SSE-NEXT: shldq %cl, %r10, %r14
+; SSE-NEXT: movq -8(%rsp,%rax), %r10
+; SSE-NEXT: shldq %cl, %r9, %r10
+; SSE-NEXT: movq -64(%rsp,%rax), %rax
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: shlq %cl, %r9
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: movq %r10, 56(%rdi)
+; SSE-NEXT: movq %r14, 48(%rdi)
+; SSE-NEXT: movq %rbx, 40(%rdi)
+; SSE-NEXT: movq %r11, 32(%rdi)
+; SSE-NEXT: movq %r8, 24(%rdi)
+; SSE-NEXT: movq %rsi, 16(%rdi)
+; SSE-NEXT: movq %rdx, 8(%rdi)
+; SSE-NEXT: movq %r9, (%rdi)
+; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: shl_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: andl $56, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: movslq %eax, %r8
+; AVX2-NEXT: movq -56(%rsp,%r8), %rdx
+; AVX2-NEXT: movq -48(%rsp,%r8), %rax
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: shldq %cl, %rdx, %rsi
+; AVX2-NEXT: movq -40(%rsp,%r8), %r10
+; AVX2-NEXT: movq %r10, %r9
+; AVX2-NEXT: shldq %cl, %rax, %r9
+; AVX2-NEXT: movq -32(%rsp,%r8), %rax
+; AVX2-NEXT: movq %rax, %r11
+; AVX2-NEXT: shldq %cl, %r10, %r11
+; AVX2-NEXT: movq -24(%rsp,%r8), %r10
+; AVX2-NEXT: movq %r10, %rbx
+; AVX2-NEXT: shldq %cl, %rax, %rbx
+; AVX2-NEXT: movq -16(%rsp,%r8), %rax
+; AVX2-NEXT: movq %rax, %r14
+; AVX2-NEXT: shldq %cl, %r10, %r14
+; AVX2-NEXT: movq -8(%rsp,%r8), %r10
+; AVX2-NEXT: shldq %cl, %rax, %r10
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: movq -64(%rsp,%r8), %rdi
+; AVX2-NEXT: shlxq %rcx, %rdi, %r8
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %rdi, %rdx
+; AVX2-NEXT: movq %r10, 56(%rax)
+; AVX2-NEXT: movq %r14, 48(%rax)
+; AVX2-NEXT: movq %rbx, 40(%rax)
+; AVX2-NEXT: movq %r11, 32(%rax)
+; AVX2-NEXT: movq %r9, 24(%rax)
+; AVX2-NEXT: movq %rsi, 16(%rax)
+; AVX2-NEXT: movq %rdx, 8(%rax)
+; AVX2-NEXT: movq %r8, (%rax)
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shl_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %eax, %ecx
+; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $56, %eax
+; AVX512F-NEXT: negl %eax
+; AVX512F-NEXT: movslq %eax, %r8
+; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
+; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
+; AVX512F-NEXT: movq %rax, %rsi
+; AVX512F-NEXT: shldq %cl, %rdx, %rsi
+; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
+; AVX512F-NEXT: movq %r10, %r9
+; AVX512F-NEXT: shldq %cl, %rax, %r9
+; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
+; AVX512F-NEXT: movq %rax, %r11
+; AVX512F-NEXT: shldq %cl, %r10, %r11
+; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
+; AVX512F-NEXT: movq %r10, %rbx
+; AVX512F-NEXT: shldq %cl, %rax, %rbx
+; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
+; AVX512F-NEXT: movq %rax, %r14
+; AVX512F-NEXT: shldq %cl, %r10, %r14
+; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
+; AVX512F-NEXT: shldq %cl, %rax, %r10
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
+; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
+; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512F-NEXT: shldq %cl, %rdi, %rdx
+; AVX512F-NEXT: movq %r10, 56(%rax)
+; AVX512F-NEXT: movq %r14, 48(%rax)
+; AVX512F-NEXT: movq %rbx, 40(%rax)
+; AVX512F-NEXT: movq %r11, 32(%rax)
+; AVX512F-NEXT: movq %r9, 24(%rax)
+; AVX512F-NEXT: movq %rsi, 16(%rax)
+; AVX512F-NEXT: movq %rdx, 8(%rax)
+; AVX512F-NEXT: movq %r8, (%rax)
+; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shl_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %r15
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: shrl $3, %eax
+; AVX512VL-NEXT: andl $56, %eax
+; AVX512VL-NEXT: negl %eax
+; AVX512VL-NEXT: movslq %eax, %r9
+; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
+; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VL-NEXT: movq %rax, %rsi
+; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
+; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VL-NEXT: movq %r10, %r8
+; AVX512VL-NEXT: shldq %cl, %rax, %r8
+; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq %r11, %rbx
+; AVX512VL-NEXT: shldq %cl, %r10, %rbx
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VL-NEXT: movq %rdi, %r10
+; AVX512VL-NEXT: shldq %cl, %r11, %r10
+; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VL-NEXT: movq %r14, %r15
+; AVX512VL-NEXT: shldq %cl, %rdi, %r15
+; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VL-NEXT: shldq %cl, %r14, %rdi
+; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
+; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512VL-NEXT: shldq %cl, %r11, %rdx
+; AVX512VL-NEXT: movq %rdi, 56(%rax)
+; AVX512VL-NEXT: movq %r15, 48(%rax)
+; AVX512VL-NEXT: movq %r10, 40(%rax)
+; AVX512VL-NEXT: movq %rbx, 32(%rax)
+; AVX512VL-NEXT: movq %r8, 24(%rax)
+; AVX512VL-NEXT: movq %rsi, 16(%rax)
+; AVX512VL-NEXT: movq %rdx, 8(%rax)
+; AVX512VL-NEXT: movq %r9, (%rax)
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512VBMI-LABEL: shl_i512:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %r15
+; AVX512VBMI-NEXT: pushq %r14
+; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movl %eax, %ecx
+; AVX512VBMI-NEXT: andl $63, %ecx
+; AVX512VBMI-NEXT: shrl $3, %eax
+; AVX512VBMI-NEXT: andl $56, %eax
+; AVX512VBMI-NEXT: negl %eax
+; AVX512VBMI-NEXT: movslq %eax, %r9
+; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
+; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VBMI-NEXT: movq %rax, %rsi
+; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
+; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VBMI-NEXT: movq %r10, %r8
+; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
+; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq %r11, %rbx
+; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: movq %rdi, %r10
+; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
+; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VBMI-NEXT: movq %r14, %r15
+; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
+; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
+; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
+; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx
+; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
+; AVX512VBMI-NEXT: movq %r15, 48(%rax)
+; AVX512VBMI-NEXT: movq %r10, 40(%rax)
+; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
+; AVX512VBMI-NEXT: movq %r8, 24(%rax)
+; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
+; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
+; AVX512VBMI-NEXT: movq %r9, (%rax)
+; AVX512VBMI-NEXT: popq %rbx
+; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vzeroupper
+; AVX512VBMI-NEXT: retq
+ %r = shl i512 %a0, %a1
+ ret i512 %r
+}
-define <8 x i64> @shl_i512_1(<8 x i64> %a) {
-; AVX512VL-LABEL: shl_i512_1:
+define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
+; SSE-LABEL: lshr_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: andl $56, %eax
+; SSE-NEXT: movq -112(%rsp,%rax), %rdx
+; SSE-NEXT: movq -120(%rsp,%rax), %r9
+; SSE-NEXT: movq %r9, %rsi
+; SSE-NEXT: shrdq %cl, %rdx, %rsi
+; SSE-NEXT: movq -104(%rsp,%rax), %r8
+; SSE-NEXT: shrdq %cl, %r8, %rdx
+; SSE-NEXT: movq -96(%rsp,%rax), %r10
+; SSE-NEXT: shrdq %cl, %r10, %r8
+; SSE-NEXT: movq -88(%rsp,%rax), %r11
+; SSE-NEXT: shrdq %cl, %r11, %r10
+; SSE-NEXT: movq -80(%rsp,%rax), %rbx
+; SSE-NEXT: shrdq %cl, %rbx, %r11
+; SSE-NEXT: movq -72(%rsp,%rax), %r14
+; SSE-NEXT: shrdq %cl, %r14, %rbx
+; SSE-NEXT: movq -128(%rsp,%rax), %r15
+; SSE-NEXT: shrdq %cl, %r9, %r15
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shrq %cl, %r14
+; SSE-NEXT: movq %r14, 56(%rdi)
+; SSE-NEXT: movq %rbx, 48(%rdi)
+; SSE-NEXT: movq %r11, 40(%rdi)
+; SSE-NEXT: movq %r10, 32(%rdi)
+; SSE-NEXT: movq %r8, 24(%rdi)
+; SSE-NEXT: movq %rdx, 16(%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
+; SSE-NEXT: movq %r15, (%rdi)
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: lshr_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: andl $56, %eax
+; AVX2-NEXT: movq -112(%rsp,%rax), %rdx
+; AVX2-NEXT: movq -120(%rsp,%rax), %r9
+; AVX2-NEXT: movq %r9, %rsi
+; AVX2-NEXT: shrdq %cl, %rdx, %rsi
+; AVX2-NEXT: movq -104(%rsp,%rax), %r8
+; AVX2-NEXT: shrdq %cl, %r8, %rdx
+; AVX2-NEXT: movq -96(%rsp,%rax), %r10
+; AVX2-NEXT: shrdq %cl, %r10, %r8
+; AVX2-NEXT: movq -88(%rsp,%rax), %r11
+; AVX2-NEXT: shrdq %cl, %r11, %r10
+; AVX2-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX2-NEXT: shrdq %cl, %rbx, %r11
+; AVX2-NEXT: movq -128(%rsp,%rax), %r14
+; AVX2-NEXT: movq -72(%rsp,%rax), %r15
+; AVX2-NEXT: shrdq %cl, %r15, %rbx
+; AVX2-NEXT: shrdq %cl, %r9, %r14
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrxq %rcx, %r15, %rcx
+; AVX2-NEXT: movq %rcx, 56(%rdi)
+; AVX2-NEXT: movq %rbx, 48(%rdi)
+; AVX2-NEXT: movq %r11, 40(%rdi)
+; AVX2-NEXT: movq %r10, 32(%rdi)
+; AVX2-NEXT: movq %r8, 24(%rdi)
+; AVX2-NEXT: movq %rdx, 16(%rdi)
+; AVX2-NEXT: movq %rsi, 8(%rdi)
+; AVX2-NEXT: movq %r14, (%rdi)
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: lshr_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r15
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %eax, %ecx
+; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $56, %eax
+; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx
+; AVX512F-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512F-NEXT: movq %r9, %rsi
+; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
+; AVX512F-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512F-NEXT: shrdq %cl, %r8, %rdx
+; AVX512F-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512F-NEXT: shrdq %cl, %r10, %r8
+; AVX512F-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512F-NEXT: shrdq %cl, %r11, %r10
+; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512F-NEXT: shrdq %cl, %rbx, %r11
+; AVX512F-NEXT: movq -128(%rsp,%rax), %r14
+; AVX512F-NEXT: movq -72(%rsp,%rax), %r15
+; AVX512F-NEXT: shrdq %cl, %r15, %rbx
+; AVX512F-NEXT: shrdq %cl, %r9, %r14
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: shrxq %rcx, %r15, %rcx
+; AVX512F-NEXT: movq %rcx, 56(%rdi)
+; AVX512F-NEXT: movq %rbx, 48(%rdi)
+; AVX512F-NEXT: movq %r11, 40(%rdi)
+; AVX512F-NEXT: movq %r10, 32(%rdi)
+; AVX512F-NEXT: movq %r8, 24(%rdi)
+; AVX512F-NEXT: movq %rdx, 16(%rdi)
+; AVX512F-NEXT: movq %rsi, 8(%rdi)
+; AVX512F-NEXT: movq %r14, (%rdi)
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: lshr_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm0[3,4,5,6,7,0,1,2]
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm3
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX512VL-NEXT: vpsrlq $63, %xmm4, %xmm4
-; AVX512VL-NEXT: vpaddq %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpor %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpaddq %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT: vpsrlq $63, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512VL-NEXT: vpsrlq $63, %zmm0, %zmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
+; AVX512VL-NEXT: pushq %r15
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: shrl $3, %eax
+; AVX512VL-NEXT: andl $56, %eax
+; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx
+; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512VL-NEXT: movq %r9, %rsi
+; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
+; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512VL-NEXT: shrdq %cl, %r8, %rdx
+; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512VL-NEXT: shrdq %cl, %r10, %r8
+; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512VL-NEXT: shrdq %cl, %r11, %r10
+; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14
+; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15
+; AVX512VL-NEXT: shrdq %cl, %r9, %r15
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
+; AVX512VL-NEXT: movq %rcx, 56(%rdi)
+; AVX512VL-NEXT: movq %rbx, 48(%rdi)
+; AVX512VL-NEXT: movq %r11, 40(%rdi)
+; AVX512VL-NEXT: movq %r10, 32(%rdi)
+; AVX512VL-NEXT: movq %r8, 24(%rdi)
+; AVX512VL-NEXT: movq %rdx, 16(%rdi)
+; AVX512VL-NEXT: movq %rsi, 8(%rdi)
+; AVX512VL-NEXT: movq %r15, (%rdi)
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
-; AVX512VBMI-LABEL: shl_i512_1:
+; AVX512VBMI-LABEL: lshr_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3
-; AVX512VBMI-NEXT: vpaddq %xmm0, %xmm0, %xmm4
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VBMI-NEXT: vpshldq $1, %ymm1, %ymm2, %ymm1
-; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VBMI-NEXT: vpshldq $1, %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
+; AVX512VBMI-NEXT: pushq %r15
+; AVX512VBMI-NEXT: pushq %r14
+; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movl %eax, %ecx
+; AVX512VBMI-NEXT: andl $63, %ecx
+; AVX512VBMI-NEXT: shrl $3, %eax
+; AVX512VBMI-NEXT: andl $56, %eax
+; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx
+; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512VBMI-NEXT: movq %r9, %rsi
+; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
+; AVX512VBMI-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx
+; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8
+; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
+; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14
+; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15
+; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
+; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
+; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
+; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
+; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
+; AVX512VBMI-NEXT: movq %r8, 24(%rdi)
+; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
+; AVX512VBMI-NEXT: movq %rsi, 8(%rdi)
+; AVX512VBMI-NEXT: movq %r15, (%rdi)
+; AVX512VBMI-NEXT: popq %rbx
+; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
+ %r = lshr i512 %a0, %a1
+ ret i512 %r
+}
+
+define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
+; SSE-LABEL: ashr_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: sarq $63, %r10
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: andl $56, %eax
+; SSE-NEXT: movq -112(%rsp,%rax), %rdx
+; SSE-NEXT: movq -120(%rsp,%rax), %r9
+; SSE-NEXT: movq %r9, %rsi
+; SSE-NEXT: shrdq %cl, %rdx, %rsi
+; SSE-NEXT: movq -104(%rsp,%rax), %r8
+; SSE-NEXT: shrdq %cl, %r8, %rdx
+; SSE-NEXT: movq -96(%rsp,%rax), %r10
+; SSE-NEXT: shrdq %cl, %r10, %r8
+; SSE-NEXT: movq -88(%rsp,%rax), %r11
+; SSE-NEXT: shrdq %cl, %r11, %r10
+; SSE-NEXT: movq -80(%rsp,%rax), %rbx
+; SSE-NEXT: shrdq %cl, %rbx, %r11
+; SSE-NEXT: movq -72(%rsp,%rax), %r14
+; SSE-NEXT: shrdq %cl, %r14, %rbx
+; SSE-NEXT: movq -128(%rsp,%rax), %r15
+; SSE-NEXT: shrdq %cl, %r9, %r15
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: sarq %cl, %r14
+; SSE-NEXT: movq %r14, 56(%rdi)
+; SSE-NEXT: movq %rbx, 48(%rdi)
+; SSE-NEXT: movq %r11, 40(%rdi)
+; SSE-NEXT: movq %r10, 32(%rdi)
+; SSE-NEXT: movq %r8, 24(%rdi)
+; SSE-NEXT: movq %rdx, 16(%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
+; SSE-NEXT: movq %r15, (%rdi)
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: ashr_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: sarq $63, %r10
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: andl $56, %eax
+; AVX2-NEXT: movq -112(%rsp,%rax), %rdx
+; AVX2-NEXT: movq -120(%rsp,%rax), %r9
+; AVX2-NEXT: movq %r9, %rsi
+; AVX2-NEXT: shrdq %cl, %rdx, %rsi
+; AVX2-NEXT: movq -104(%rsp,%rax), %r8
+; AVX2-NEXT: shrdq %cl, %r8, %rdx
+; AVX2-NEXT: movq -96(%rsp,%rax), %r10
+; AVX2-NEXT: shrdq %cl, %r10, %r8
+; AVX2-NEXT: movq -88(%rsp,%rax), %r11
+; AVX2-NEXT: shrdq %cl, %r11, %r10
+; AVX2-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX2-NEXT: shrdq %cl, %rbx, %r11
+; AVX2-NEXT: movq -128(%rsp,%rax), %r14
+; AVX2-NEXT: movq -72(%rsp,%rax), %r15
+; AVX2-NEXT: shrdq %cl, %r15, %rbx
+; AVX2-NEXT: shrdq %cl, %r9, %r14
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: sarxq %rcx, %r15, %rcx
+; AVX2-NEXT: movq %rcx, 56(%rdi)
+; AVX2-NEXT: movq %rbx, 48(%rdi)
+; AVX2-NEXT: movq %r11, 40(%rdi)
+; AVX2-NEXT: movq %r10, 32(%rdi)
+; AVX2-NEXT: movq %r8, 24(%rdi)
+; AVX2-NEXT: movq %rdx, 16(%rdi)
+; AVX2-NEXT: movq %rsi, 8(%rdi)
+; AVX2-NEXT: movq %r14, (%rdi)
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: ashr_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r15
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: sarq $63, %r10
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %eax, %ecx
+; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $56, %eax
+; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx
+; AVX512F-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512F-NEXT: movq %r9, %rsi
+; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
+; AVX512F-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512F-NEXT: shrdq %cl, %r8, %rdx
+; AVX512F-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512F-NEXT: shrdq %cl, %r10, %r8
+; AVX512F-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512F-NEXT: shrdq %cl, %r11, %r10
+; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512F-NEXT: shrdq %cl, %rbx, %r11
+; AVX512F-NEXT: movq -128(%rsp,%rax), %r14
+; AVX512F-NEXT: movq -72(%rsp,%rax), %r15
+; AVX512F-NEXT: shrdq %cl, %r15, %rbx
+; AVX512F-NEXT: shrdq %cl, %r9, %r14
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: sarxq %rcx, %r15, %rcx
+; AVX512F-NEXT: movq %rcx, 56(%rdi)
+; AVX512F-NEXT: movq %rbx, 48(%rdi)
+; AVX512F-NEXT: movq %r11, 40(%rdi)
+; AVX512F-NEXT: movq %r10, 32(%rdi)
+; AVX512F-NEXT: movq %r8, 24(%rdi)
+; AVX512F-NEXT: movq %rdx, 16(%rdi)
+; AVX512F-NEXT: movq %rsi, 8(%rdi)
+; AVX512F-NEXT: movq %r14, (%rdi)
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: ashr_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %r15
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: sarq $63, %r10
+; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: shrl $3, %eax
+; AVX512VL-NEXT: andl $56, %eax
+; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx
+; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512VL-NEXT: movq %r9, %rsi
+; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
+; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512VL-NEXT: shrdq %cl, %r8, %rdx
+; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512VL-NEXT: shrdq %cl, %r10, %r8
+; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512VL-NEXT: shrdq %cl, %r11, %r10
+; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14
+; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15
+; AVX512VL-NEXT: shrdq %cl, %r9, %r15
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
+; AVX512VL-NEXT: movq %rcx, 56(%rdi)
+; AVX512VL-NEXT: movq %rbx, 48(%rdi)
+; AVX512VL-NEXT: movq %r11, 40(%rdi)
+; AVX512VL-NEXT: movq %r10, 32(%rdi)
+; AVX512VL-NEXT: movq %r8, 24(%rdi)
+; AVX512VL-NEXT: movq %rdx, 16(%rdi)
+; AVX512VL-NEXT: movq %rsi, 8(%rdi)
+; AVX512VL-NEXT: movq %r15, (%rdi)
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: retq
;
-; ZNVER4-LABEL: shl_i512_1:
-; ZNVER4: # %bb.0:
-; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm2
-; ZNVER4-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; ZNVER4-NEXT: vpaddq %xmm0, %xmm0, %xmm4
-; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ZNVER4-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3
-; ZNVER4-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; ZNVER4-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; ZNVER4-NEXT: vpshldq $1, %ymm1, %ymm2, %ymm1
-; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; ZNVER4-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; ZNVER4-NEXT: vpshldq $1, %zmm0, %zmm3, %zmm0
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
-; ZNVER4-NEXT: retq
- %d = bitcast <8 x i64> %a to i512
- %s = shl i512 %d, 1
- %r = bitcast i512 %s to <8 x i64>
- ret <8 x i64> %r
+; AVX512VBMI-LABEL: ashr_i512:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %r15
+; AVX512VBMI-NEXT: pushq %r14
+; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: sarq $63, %r10
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movl %eax, %ecx
+; AVX512VBMI-NEXT: andl $63, %ecx
+; AVX512VBMI-NEXT: shrl $3, %eax
+; AVX512VBMI-NEXT: andl $56, %eax
+; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx
+; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9
+; AVX512VBMI-NEXT: movq %r9, %rsi
+; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
+; AVX512VBMI-NEXT: movq -104(%rsp,%rax), %r8
+; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx
+; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10
+; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8
+; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11
+; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
+; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx
+; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14
+; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15
+; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
+; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
+; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
+; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
+; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
+; AVX512VBMI-NEXT: movq %r8, 24(%rdi)
+; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
+; AVX512VBMI-NEXT: movq %rsi, 8(%rdi)
+; AVX512VBMI-NEXT: movq %r15, (%rdi)
+; AVX512VBMI-NEXT: popq %rbx
+; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: retq
+ %r = ashr i512 %a0, %a1
+ ret i512 %r
+}
+
+define i512 @shl_i512_1(i512 %a0) nounwind {
+; CHECK-LABEL: shl_i512_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; CHECK-NEXT: shldq $1, %rdi, %r10
+; CHECK-NEXT: shldq $1, %r11, %rdi
+; CHECK-NEXT: shldq $1, %r9, %r11
+; CHECK-NEXT: shldq $1, %r8, %r9
+; CHECK-NEXT: shldq $1, %rcx, %r8
+; CHECK-NEXT: shldq $1, %rdx, %rcx
+; CHECK-NEXT: shldq $1, %rsi, %rdx
+; CHECK-NEXT: addq %rsi, %rsi
+; CHECK-NEXT: movq %r10, 56(%rax)
+; CHECK-NEXT: movq %rdi, 48(%rax)
+; CHECK-NEXT: movq %r11, 40(%rax)
+; CHECK-NEXT: movq %r9, 32(%rax)
+; CHECK-NEXT: movq %r8, 24(%rax)
+; CHECK-NEXT: movq %rcx, 16(%rax)
+; CHECK-NEXT: movq %rdx, 8(%rax)
+; CHECK-NEXT: movq %rsi, (%rax)
+; CHECK-NEXT: retq
+ %r = shl i512 %a0, 1
+ ret i512 %r
+}
+
+define i512 @lshr_i512_1(i512 %a0) nounwind {
+; CHECK-LABEL: lshr_i512_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; CHECK-NEXT: shrdq $1, %rdx, %rsi
+; CHECK-NEXT: shrdq $1, %rcx, %rdx
+; CHECK-NEXT: shrdq $1, %r8, %rcx
+; CHECK-NEXT: shrdq $1, %r9, %r8
+; CHECK-NEXT: shrdq $1, %r11, %r9
+; CHECK-NEXT: shrdq $1, %rdi, %r11
+; CHECK-NEXT: shrdq $1, %r10, %rdi
+; CHECK-NEXT: shrq %r10
+; CHECK-NEXT: movq %r10, 56(%rax)
+; CHECK-NEXT: movq %rdi, 48(%rax)
+; CHECK-NEXT: movq %r11, 40(%rax)
+; CHECK-NEXT: movq %r9, 32(%rax)
+; CHECK-NEXT: movq %r8, 24(%rax)
+; CHECK-NEXT: movq %rcx, 16(%rax)
+; CHECK-NEXT: movq %rdx, 8(%rax)
+; CHECK-NEXT: movq %rsi, (%rax)
+; CHECK-NEXT: retq
+ %r = lshr i512 %a0, 1
+ ret i512 %r
+}
+
+define i512 @ashr_i512_1(i512 %a0) nounwind {
+; CHECK-LABEL: ashr_i512_1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; CHECK-NEXT: shrdq $1, %rdx, %rsi
+; CHECK-NEXT: shrdq $1, %rcx, %rdx
+; CHECK-NEXT: shrdq $1, %r8, %rcx
+; CHECK-NEXT: shrdq $1, %r9, %r8
+; CHECK-NEXT: shrdq $1, %r11, %r9
+; CHECK-NEXT: shrdq $1, %rdi, %r11
+; CHECK-NEXT: shrdq $1, %r10, %rdi
+; CHECK-NEXT: sarq %r10
+; CHECK-NEXT: movq %r10, 56(%rax)
+; CHECK-NEXT: movq %rdi, 48(%rax)
+; CHECK-NEXT: movq %r11, 40(%rax)
+; CHECK-NEXT: movq %r9, 32(%rax)
+; CHECK-NEXT: movq %r8, 24(%rax)
+; CHECK-NEXT: movq %rcx, 16(%rax)
+; CHECK-NEXT: movq %rdx, 8(%rax)
+; CHECK-NEXT: movq %rsi, (%rax)
+; CHECK-NEXT: retq
+ %r = ashr i512 %a0, 1
+ ret i512 %r
+}
+
+define i512 @shl_i512_200(i512 %a0) nounwind {
+; SSE-LABEL: shl_i512_200:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: shldq $8, %r8, %r9
+; SSE-NEXT: shldq $8, %rcx, %r8
+; SSE-NEXT: shldq $8, %rdx, %rcx
+; SSE-NEXT: shldq $8, %rsi, %rdx
+; SSE-NEXT: shlq $8, %rsi
+; SSE-NEXT: movq %r9, 56(%rdi)
+; SSE-NEXT: movq %r8, 48(%rdi)
+; SSE-NEXT: movq %rcx, 40(%rdi)
+; SSE-NEXT: movq %rdx, 32(%rdi)
+; SSE-NEXT: movq %rsi, 24(%rdi)
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm0, (%rdi)
+; SSE-NEXT: movq $0, 16(%rdi)
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: shl_i512_200:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shldq $8, %r8, %r9
+; AVX2-NEXT: shldq $8, %rcx, %r8
+; AVX2-NEXT: shldq $8, %rdx, %rcx
+; AVX2-NEXT: shldq $8, %rsi, %rdx
+; AVX2-NEXT: shlq $8, %rsi
+; AVX2-NEXT: movq %r9, 56(%rdi)
+; AVX2-NEXT: movq %r8, 48(%rdi)
+; AVX2-NEXT: movq %rcx, 40(%rdi)
+; AVX2-NEXT: movq %rdx, 32(%rdi)
+; AVX2-NEXT: movq %rsi, 24(%rdi)
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovaps %xmm0, (%rdi)
+; AVX2-NEXT: movq $0, 16(%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: shl_i512_200:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: shldq $8, %r8, %r9
+; AVX512-NEXT: shldq $8, %rcx, %r8
+; AVX512-NEXT: shldq $8, %rdx, %rcx
+; AVX512-NEXT: shldq $8, %rsi, %rdx
+; AVX512-NEXT: shlq $8, %rsi
+; AVX512-NEXT: movq %r9, 56(%rdi)
+; AVX512-NEXT: movq %r8, 48(%rdi)
+; AVX512-NEXT: movq %rcx, 40(%rdi)
+; AVX512-NEXT: movq %rdx, 32(%rdi)
+; AVX512-NEXT: movq %rsi, 24(%rdi)
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, (%rdi)
+; AVX512-NEXT: movq $0, 16(%rdi)
+; AVX512-NEXT: retq
+ %r = shl i512 %a0, 200
+ ret i512 %r
+}
+
+define i512 @lshr_i512_200(i512 %a0) nounwind {
+; SSE-LABEL: lshr_i512_200:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE-NEXT: shrdq $8, %r9, %r8
+; SSE-NEXT: shrdq $8, %rsi, %r9
+; SSE-NEXT: shrdq $8, %rcx, %rsi
+; SSE-NEXT: shrdq $8, %rdx, %rcx
+; SSE-NEXT: shrq $8, %rdx
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, 40(%rdi)
+; SSE-NEXT: movq %rdx, 32(%rdi)
+; SSE-NEXT: movq %rcx, 24(%rdi)
+; SSE-NEXT: movq %rsi, 16(%rdi)
+; SSE-NEXT: movq %r9, 8(%rdi)
+; SSE-NEXT: movq %r8, (%rdi)
+; SSE-NEXT: movq $0, 56(%rdi)
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: lshr_i512_200:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: shrdq $8, %r9, %r8
+; AVX2-NEXT: shrdq $8, %rsi, %r9
+; AVX2-NEXT: shrdq $8, %rcx, %rsi
+; AVX2-NEXT: shrdq $8, %rdx, %rcx
+; AVX2-NEXT: shrq $8, %rdx
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %xmm0, 40(%rdi)
+; AVX2-NEXT: movq %rdx, 32(%rdi)
+; AVX2-NEXT: movq %rcx, 24(%rdi)
+; AVX2-NEXT: movq %rsi, 16(%rdi)
+; AVX2-NEXT: movq %r9, 8(%rdi)
+; AVX2-NEXT: movq %r8, (%rdi)
+; AVX2-NEXT: movq $0, 56(%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: lshr_i512_200:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movq %rdi, %rax
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: shrdq $8, %r9, %r8
+; AVX512-NEXT: shrdq $8, %rsi, %r9
+; AVX512-NEXT: shrdq $8, %rcx, %rsi
+; AVX512-NEXT: shrdq $8, %rdx, %rcx
+; AVX512-NEXT: shrq $8, %rdx
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %xmm0, 40(%rdi)
+; AVX512-NEXT: movq %rdx, 32(%rdi)
+; AVX512-NEXT: movq %rcx, 24(%rdi)
+; AVX512-NEXT: movq %rsi, 16(%rdi)
+; AVX512-NEXT: movq %r9, 8(%rdi)
+; AVX512-NEXT: movq %r8, (%rdi)
+; AVX512-NEXT: movq $0, 56(%rdi)
+; AVX512-NEXT: retq
+ %r = lshr i512 %a0, 200
+ ret i512 %r
+}
+
+define i512 @ashr_i512_200(i512 %a0) nounwind {
+; CHECK-LABEL: ashr_i512_200:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT: shrdq $8, %r9, %r8
+; CHECK-NEXT: shrdq $8, %rsi, %r9
+; CHECK-NEXT: shrdq $8, %rcx, %rsi
+; CHECK-NEXT: shrdq $8, %rdx, %rcx
+; CHECK-NEXT: movq %rdx, %rdi
+; CHECK-NEXT: sarq $8, %rdi
+; CHECK-NEXT: sarq $63, %rdx
+; CHECK-NEXT: movq %rdx, 56(%rax)
+; CHECK-NEXT: movq %rdx, 48(%rax)
+; CHECK-NEXT: movq %rdx, 40(%rax)
+; CHECK-NEXT: movq %rdi, 32(%rax)
+; CHECK-NEXT: movq %rcx, 24(%rax)
+; CHECK-NEXT: movq %rsi, 16(%rax)
+; CHECK-NEXT: movq %r9, 8(%rax)
+; CHECK-NEXT: movq %r8, (%rax)
+; CHECK-NEXT: retq
+ %r = ashr i512 %a0, 200
+ ret i512 %r
}
-define <8 x i64> @lshr_i512_1(<8 x i64> %a) {
-; AVX512VL-LABEL: lshr_i512_1:
+define i512 @shl_i512_511(i512 %a0) nounwind {
+; SSE-LABEL: shl_i512_511:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: shlq $63, %rsi
+; SSE-NEXT: movq %rsi, 56(%rdi)
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm0, 32(%rdi)
+; SSE-NEXT: movaps %xmm0, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, (%rdi)
+; SSE-NEXT: movq $0, 48(%rdi)
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: shl_i512_511:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shlq $63, %rsi
+; AVX2-NEXT: movq %rsi, 56(%rdi)
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovaps %xmm0, 32(%rdi)
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, (%rdi)
+; AVX2-NEXT: movq $0, 48(%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: shl_i512_511:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: shlq $63, %rsi
+; AVX512F-NEXT: movq %rsi, 56(%rdi)
+; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovaps %xmm0, 32(%rdi)
+; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovups %ymm0, (%rdi)
+; AVX512F-NEXT: movq $0, 48(%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shl_i512_511:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: shlq $63, %rsi
+; AVX512VL-NEXT: movq %rsi, 56(%rdi)
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovaps %xmm0, 32(%rdi)
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovups %ymm0, (%rdi)
+; AVX512VL-NEXT: movq $0, 48(%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512VBMI-LABEL: shl_i512_511:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: shlq $63, %rsi
+; AVX512VBMI-NEXT: movq %rsi, 56(%rdi)
+; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovaps %xmm0, 32(%rdi)
+; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovups %ymm0, (%rdi)
+; AVX512VBMI-NEXT: movq $0, 48(%rdi)
+; AVX512VBMI-NEXT: vzeroupper
+; AVX512VBMI-NEXT: retq
+ %r = shl i512 %a0, 511
+ ret i512 %r
+}
+
+define i512 @lshr_i512_511(i512 %a0) nounwind {
+; SSE-LABEL: lshr_i512_511:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: shrq $63, %rcx
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, 40(%rdi)
+; SSE-NEXT: movups %xmm0, 24(%rdi)
+; SSE-NEXT: movups %xmm0, 8(%rdi)
+; SSE-NEXT: movq %rcx, (%rdi)
+; SSE-NEXT: movq $0, 56(%rdi)
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: lshr_i512_511:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: shrq $63, %rcx
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %xmm0, 40(%rdi)
+; AVX2-NEXT: movq %rcx, (%rdi)
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, 8(%rdi)
+; AVX2-NEXT: movq $0, 56(%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: lshr_i512_511:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT: shrq $63, %rcx
+; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovups %xmm0, 40(%rdi)
+; AVX512F-NEXT: movq %rcx, (%rdi)
+; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovups %ymm0, 8(%rdi)
+; AVX512F-NEXT: movq $0, 56(%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: lshr_i512_511:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; AVX512VL-NEXT: vpsllq $63, %xmm3, %xmm4
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX512VL-NEXT: vpsrlq $1, %xmm5, %xmm5
-; AVX512VL-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; AVX512VL-NEXT: vpsrlq $1, %xmm3, %xmm3
-; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsllq $63, %ymm1, %ymm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VL-NEXT: vpsrlq $1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovups %xmm0, 40(%rdi)
+; AVX512VL-NEXT: shrq $63, %rcx
+; AVX512VL-NEXT: movq %rcx, (%rdi)
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovups %ymm0, 8(%rdi)
+; AVX512VL-NEXT: movq $0, 56(%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
-; AVX512VBMI-LABEL: lshr_i512_1:
+; AVX512VBMI-LABEL: lshr_i512_511:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512VBMI-NEXT: vpsrlq $1, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
-; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovups %xmm0, 40(%rdi)
+; AVX512VBMI-NEXT: shrq $63, %rcx
+; AVX512VBMI-NEXT: movq %rcx, (%rdi)
+; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovups %ymm0, 8(%rdi)
+; AVX512VBMI-NEXT: movq $0, 56(%rdi)
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
+ %r = lshr i512 %a0, 511
+ ret i512 %r
+}
+
+define i512 @ashr_i512_511(i512 %a0) nounwind {
+; CHECK-LABEL: ashr_i512_511:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: sarq $63, %rcx
+; CHECK-NEXT: movq %rcx, 56(%rdi)
+; CHECK-NEXT: movq %rcx, 48(%rdi)
+; CHECK-NEXT: movq %rcx, 40(%rdi)
+; CHECK-NEXT: movq %rcx, 32(%rdi)
+; CHECK-NEXT: movq %rcx, 24(%rdi)
+; CHECK-NEXT: movq %rcx, 16(%rdi)
+; CHECK-NEXT: movq %rcx, 8(%rdi)
+; CHECK-NEXT: movq %rcx, (%rdi)
+; CHECK-NEXT: retq
+ %r = ashr i512 %a0, 511
+ ret i512 %r
+}
+
+define i512 @shl_1_i512(i512 %a0) nounwind {
+; SSE-LABEL: shl_1_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rax
+; SSE-NEXT: movq -56(%rsp,%rax), %rdx
+; SSE-NEXT: movq -48(%rsp,%rax), %r9
+; SSE-NEXT: movq %r9, %rsi
+; SSE-NEXT: shldq %cl, %rdx, %rsi
+; SSE-NEXT: movq -40(%rsp,%rax), %r10
+; SSE-NEXT: movq %r10, %r8
+; SSE-NEXT: shldq %cl, %r9, %r8
+; SSE-NEXT: movq -32(%rsp,%rax), %r9
+; SSE-NEXT: movq %r9, %r11
+; SSE-NEXT: shldq %cl, %r10, %r11
+; SSE-NEXT: movq -24(%rsp,%rax), %r10
+; SSE-NEXT: movq %r10, %rbx
+; SSE-NEXT: shldq %cl, %r9, %rbx
+; SSE-NEXT: movq -16(%rsp,%rax), %r9
+; SSE-NEXT: movq %r9, %r14
+; SSE-NEXT: shldq %cl, %r10, %r14
+; SSE-NEXT: movq -8(%rsp,%rax), %r10
+; SSE-NEXT: shldq %cl, %r9, %r10
+; SSE-NEXT: movq -64(%rsp,%rax), %rax
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: shlq %cl, %r9
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: movq %r10, 56(%rdi)
+; SSE-NEXT: movq %r14, 48(%rdi)
+; SSE-NEXT: movq %rbx, 40(%rdi)
+; SSE-NEXT: movq %r11, 32(%rdi)
+; SSE-NEXT: movq %r8, 24(%rdi)
+; SSE-NEXT: movq %rsi, 16(%rdi)
+; SSE-NEXT: movq %rdx, 8(%rdi)
+; SSE-NEXT: movq %r9, (%rdi)
+; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: shl_1_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %r8
+; AVX2-NEXT: movq -56(%rsp,%r8), %rdx
+; AVX2-NEXT: movq -48(%rsp,%r8), %rax
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: shldq %cl, %rdx, %rsi
+; AVX2-NEXT: movq -40(%rsp,%r8), %r10
+; AVX2-NEXT: movq %r10, %r9
+; AVX2-NEXT: shldq %cl, %rax, %r9
+; AVX2-NEXT: movq -32(%rsp,%r8), %rax
+; AVX2-NEXT: movq %rax, %r11
+; AVX2-NEXT: shldq %cl, %r10, %r11
+; AVX2-NEXT: movq -24(%rsp,%r8), %r10
+; AVX2-NEXT: movq %r10, %rbx
+; AVX2-NEXT: shldq %cl, %rax, %rbx
+; AVX2-NEXT: movq -16(%rsp,%r8), %rax
+; AVX2-NEXT: movq %rax, %r14
+; AVX2-NEXT: shldq %cl, %r10, %r14
+; AVX2-NEXT: movq -8(%rsp,%r8), %r10
+; AVX2-NEXT: shldq %cl, %rax, %r10
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: movq -64(%rsp,%r8), %rdi
+; AVX2-NEXT: shlxq %rcx, %rdi, %r8
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %rdi, %rdx
+; AVX2-NEXT: movq %r10, 56(%rax)
+; AVX2-NEXT: movq %r14, 48(%rax)
+; AVX2-NEXT: movq %rbx, 40(%rax)
+; AVX2-NEXT: movq %r11, 32(%rax)
+; AVX2-NEXT: movq %r9, 24(%rax)
+; AVX2-NEXT: movq %rsi, 16(%rax)
+; AVX2-NEXT: movq %rdx, 8(%rax)
+; AVX2-NEXT: movq %r8, (%rax)
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
-; ZNVER4-LABEL: lshr_i512_1:
-; ZNVER4: # %bb.0:
-; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
-; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
-; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
-; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; ZNVER4-NEXT: vpsrlq $1, %xmm2, %xmm2
-; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0
-; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; ZNVER4-NEXT: retq
- %d = bitcast <8 x i64> %a to i512
- %s = lshr i512 %d, 1
- %r = bitcast i512 %s to <8 x i64>
- ret <8 x i64> %r
+; AVX512F-LABEL: shl_1_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %esi, %ecx
+; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: shrl $3, %esi
+; AVX512F-NEXT: andl $56, %esi
+; AVX512F-NEXT: negl %esi
+; AVX512F-NEXT: movslq %esi, %r8
+; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
+; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
+; AVX512F-NEXT: movq %rax, %rsi
+; AVX512F-NEXT: shldq %cl, %rdx, %rsi
+; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
+; AVX512F-NEXT: movq %r10, %r9
+; AVX512F-NEXT: shldq %cl, %rax, %r9
+; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
+; AVX512F-NEXT: movq %rax, %r11
+; AVX512F-NEXT: shldq %cl, %r10, %r11
+; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
+; AVX512F-NEXT: movq %r10, %rbx
+; AVX512F-NEXT: shldq %cl, %rax, %rbx
+; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
+; AVX512F-NEXT: movq %rax, %r14
+; AVX512F-NEXT: shldq %cl, %r10, %r14
+; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
+; AVX512F-NEXT: shldq %cl, %rax, %r10
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
+; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
+; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512F-NEXT: shldq %cl, %rdi, %rdx
+; AVX512F-NEXT: movq %r10, 56(%rax)
+; AVX512F-NEXT: movq %r14, 48(%rax)
+; AVX512F-NEXT: movq %rbx, 40(%rax)
+; AVX512F-NEXT: movq %r11, 32(%rax)
+; AVX512F-NEXT: movq %r9, 24(%rax)
+; AVX512F-NEXT: movq %rsi, 16(%rax)
+; AVX512F-NEXT: movq %rdx, 8(%rax)
+; AVX512F-NEXT: movq %r8, (%rax)
+; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: shl_1_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %r15
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: shrl $3, %esi
+; AVX512VL-NEXT: andl $56, %esi
+; AVX512VL-NEXT: negl %esi
+; AVX512VL-NEXT: movslq %esi, %r9
+; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
+; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VL-NEXT: movq %rax, %rsi
+; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
+; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VL-NEXT: movq %r10, %r8
+; AVX512VL-NEXT: shldq %cl, %rax, %r8
+; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq %r11, %rbx
+; AVX512VL-NEXT: shldq %cl, %r10, %rbx
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VL-NEXT: movq %rdi, %r10
+; AVX512VL-NEXT: shldq %cl, %r11, %r10
+; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
+; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VL-NEXT: movq %r14, %r15
+; AVX512VL-NEXT: shldq %cl, %rdi, %r15
+; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VL-NEXT: shldq %cl, %r14, %rdi
+; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
+; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512VL-NEXT: shldq %cl, %r11, %rdx
+; AVX512VL-NEXT: movq %rdi, 56(%rax)
+; AVX512VL-NEXT: movq %r15, 48(%rax)
+; AVX512VL-NEXT: movq %r10, 40(%rax)
+; AVX512VL-NEXT: movq %rbx, 32(%rax)
+; AVX512VL-NEXT: movq %r8, 24(%rax)
+; AVX512VL-NEXT: movq %rsi, 16(%rax)
+; AVX512VL-NEXT: movq %rdx, 8(%rax)
+; AVX512VL-NEXT: movq %r9, (%rax)
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512VBMI-LABEL: shl_1_i512:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %r15
+; AVX512VBMI-NEXT: pushq %r14
+; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: andl $63, %ecx
+; AVX512VBMI-NEXT: shrl $3, %esi
+; AVX512VBMI-NEXT: andl $56, %esi
+; AVX512VBMI-NEXT: negl %esi
+; AVX512VBMI-NEXT: movslq %esi, %r9
+; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
+; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
+; AVX512VBMI-NEXT: movq %rax, %rsi
+; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
+; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
+; AVX512VBMI-NEXT: movq %r10, %r8
+; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
+; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq %r11, %rbx
+; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: movq %rdi, %r10
+; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
+; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
+; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
+; AVX512VBMI-NEXT: movq %r14, %r15
+; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
+; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
+; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
+; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx
+; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
+; AVX512VBMI-NEXT: movq %r15, 48(%rax)
+; AVX512VBMI-NEXT: movq %r10, 40(%rax)
+; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
+; AVX512VBMI-NEXT: movq %r8, 24(%rax)
+; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
+; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
+; AVX512VBMI-NEXT: movq %r9, (%rax)
+; AVX512VBMI-NEXT: popq %rbx
+; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vzeroupper
+; AVX512VBMI-NEXT: retq
+ %r = shl i512 1, %a0
+ ret i512 %r
}
-define <8 x i64> @ashr_i512_1(<8 x i64> %a) {
-; AVX512VL-LABEL: ashr_i512_1:
+define i512 @lshr_signbit_i512(i512 %a0) nounwind {
+; SSE-LABEL: lshr_signbit_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: movq -112(%rsp,%rsi), %rdx
+; SSE-NEXT: movq -120(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: shrdq %cl, %rdx, %r8
+; SSE-NEXT: movq -104(%rsp,%rsi), %r9
+; SSE-NEXT: shrdq %cl, %r9, %rdx
+; SSE-NEXT: movq -96(%rsp,%rsi), %r10
+; SSE-NEXT: shrdq %cl, %r10, %r9
+; SSE-NEXT: movq -88(%rsp,%rsi), %r11
+; SSE-NEXT: shrdq %cl, %r11, %r10
+; SSE-NEXT: movq -80(%rsp,%rsi), %rbx
+; SSE-NEXT: shrdq %cl, %rbx, %r11
+; SSE-NEXT: movq -72(%rsp,%rsi), %r14
+; SSE-NEXT: shrdq %cl, %r14, %rbx
+; SSE-NEXT: movq -128(%rsp,%rsi), %rsi
+; SSE-NEXT: shrdq %cl, %rax, %rsi
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shrq %cl, %r14
+; SSE-NEXT: movq %r14, 56(%rdi)
+; SSE-NEXT: movq %rbx, 48(%rdi)
+; SSE-NEXT: movq %r11, 40(%rdi)
+; SSE-NEXT: movq %r10, 32(%rdi)
+; SSE-NEXT: movq %r9, 24(%rdi)
+; SSE-NEXT: movq %rdx, 16(%rdi)
+; SSE-NEXT: movq %r8, 8(%rdi)
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: lshr_signbit_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
+; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: shrdq %cl, %rdx, %r8
+; AVX2-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX2-NEXT: shrdq %cl, %r9, %rdx
+; AVX2-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX2-NEXT: shrdq %cl, %r10, %r9
+; AVX2-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX2-NEXT: shrdq %cl, %r11, %r10
+; AVX2-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX2-NEXT: shrdq %cl, %rbx, %r11
+; AVX2-NEXT: movq -128(%rsp,%rsi), %r14
+; AVX2-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX2-NEXT: shrdq %cl, %rsi, %rbx
+; AVX2-NEXT: shrdq %cl, %rax, %r14
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrxq %rcx, %rsi, %rcx
+; AVX2-NEXT: movq %rcx, 56(%rdi)
+; AVX2-NEXT: movq %rbx, 48(%rdi)
+; AVX2-NEXT: movq %r11, 40(%rdi)
+; AVX2-NEXT: movq %r10, 32(%rdi)
+; AVX2-NEXT: movq %r9, 24(%rdi)
+; AVX2-NEXT: movq %rdx, 16(%rdi)
+; AVX2-NEXT: movq %r8, 8(%rdi)
+; AVX2-NEXT: movq %r14, (%rdi)
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: lshr_signbit_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %esi, %ecx
+; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: shrl $3, %esi
+; AVX512F-NEXT: andl $56, %esi
+; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512F-NEXT: movq %rax, %r8
+; AVX512F-NEXT: shrdq %cl, %rdx, %r8
+; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512F-NEXT: shrdq %cl, %r9, %rdx
+; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512F-NEXT: shrdq %cl, %r10, %r9
+; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512F-NEXT: shrdq %cl, %r11, %r10
+; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512F-NEXT: shrdq %cl, %rbx, %r11
+; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14
+; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX512F-NEXT: shrdq %cl, %rsi, %rbx
+; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: shrxq %rcx, %rsi, %rcx
+; AVX512F-NEXT: movq %rcx, 56(%rdi)
+; AVX512F-NEXT: movq %rbx, 48(%rdi)
+; AVX512F-NEXT: movq %r11, 40(%rdi)
+; AVX512F-NEXT: movq %r10, 32(%rdi)
+; AVX512F-NEXT: movq %r9, 24(%rdi)
+; AVX512F-NEXT: movq %rdx, 16(%rdi)
+; AVX512F-NEXT: movq %r8, 8(%rdi)
+; AVX512F-NEXT: movq %r14, (%rdi)
+; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: lshr_signbit_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; AVX512VL-NEXT: vpsllq $63, %xmm3, %xmm4
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX512VL-NEXT: vpsrlq $1, %xmm5, %xmm5
-; AVX512VL-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; AVX512VL-NEXT: vpsraq $1, %xmm3, %xmm3
-; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsllq $63, %ymm1, %ymm1
-; AVX512VL-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VL-NEXT: vpsrlq $1, %ymm2, %ymm2
-; AVX512VL-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm2
-; AVX512VL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VL-NEXT: vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: shrl $3, %esi
+; AVX512VL-NEXT: andl $56, %esi
+; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512VL-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VL-NEXT: movq %rax, %r8
+; AVX512VL-NEXT: shrdq %cl, %rdx, %r8
+; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
+; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VL-NEXT: shrdq %cl, %r10, %r9
+; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VL-NEXT: shrdq %cl, %r11, %r10
+; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
+; AVX512VL-NEXT: movq %rcx, 56(%rdi)
+; AVX512VL-NEXT: movq %rbx, 48(%rdi)
+; AVX512VL-NEXT: movq %r11, 40(%rdi)
+; AVX512VL-NEXT: movq %r10, 32(%rdi)
+; AVX512VL-NEXT: movq %r9, 24(%rdi)
+; AVX512VL-NEXT: movq %rdx, 16(%rdi)
+; AVX512VL-NEXT: movq %r8, 8(%rdi)
+; AVX512VL-NEXT: movq %rsi, (%rdi)
+; AVX512VL-NEXT: addq $8, %rsp
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
-; AVX512VBMI-LABEL: ashr_i512_1:
+; AVX512VBMI-LABEL: lshr_signbit_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512VBMI-NEXT: vpsraq $1, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
-; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512VBMI-NEXT: pushq %r14
+; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: andl $63, %ecx
+; AVX512VBMI-NEXT: shrl $3, %esi
+; AVX512VBMI-NEXT: andl $56, %esi
+; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VBMI-NEXT: movq %rax, %r8
+; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8
+; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
+; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
+; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
+; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
+; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
+; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
+; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
+; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
+; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
+; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
+; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
+; AVX512VBMI-NEXT: movq %rsi, (%rdi)
+; AVX512VBMI-NEXT: addq $8, %rsp
+; AVX512VBMI-NEXT: popq %rbx
+; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
+ %s = shl i512 1, 511
+ %r = lshr i512 %s, %a0
+ ret i512 %r
+}
+
+define i512 @ashr_signbit_i512(i512 %a0) nounwind {
+; SSE-LABEL: ashr_signbit_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: pushq %rax
+; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: movq -112(%rsp,%rsi), %rdx
+; SSE-NEXT: movq -120(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: shrdq %cl, %rdx, %r8
+; SSE-NEXT: movq -104(%rsp,%rsi), %r9
+; SSE-NEXT: shrdq %cl, %r9, %rdx
+; SSE-NEXT: movq -96(%rsp,%rsi), %r10
+; SSE-NEXT: shrdq %cl, %r10, %r9
+; SSE-NEXT: movq -88(%rsp,%rsi), %r11
+; SSE-NEXT: shrdq %cl, %r11, %r10
+; SSE-NEXT: movq -80(%rsp,%rsi), %rbx
+; SSE-NEXT: shrdq %cl, %rbx, %r11
+; SSE-NEXT: movq -72(%rsp,%rsi), %r14
+; SSE-NEXT: shrdq %cl, %r14, %rbx
+; SSE-NEXT: movq -128(%rsp,%rsi), %rsi
+; SSE-NEXT: shrdq %cl, %rax, %rsi
+; SSE-NEXT: movq %rdi, %rax
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: sarq %cl, %r14
+; SSE-NEXT: movq %r14, 56(%rdi)
+; SSE-NEXT: movq %rbx, 48(%rdi)
+; SSE-NEXT: movq %r11, 40(%rdi)
+; SSE-NEXT: movq %r10, 32(%rdi)
+; SSE-NEXT: movq %r9, 24(%rdi)
+; SSE-NEXT: movq %rdx, 16(%rdi)
+; SSE-NEXT: movq %r8, 8(%rdi)
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: addq $8, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: ashr_signbit_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: shrdq %cl, %rdx, %r8
+; AVX2-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX2-NEXT: shrdq %cl, %r9, %rdx
+; AVX2-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX2-NEXT: shrdq %cl, %r10, %r9
+; AVX2-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX2-NEXT: shrdq %cl, %r11, %r10
+; AVX2-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX2-NEXT: shrdq %cl, %rbx, %r11
+; AVX2-NEXT: movq -128(%rsp,%rsi), %r14
+; AVX2-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX2-NEXT: shrdq %cl, %rsi, %rbx
+; AVX2-NEXT: shrdq %cl, %rax, %r14
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: sarxq %rcx, %rsi, %rcx
+; AVX2-NEXT: movq %rcx, 56(%rdi)
+; AVX2-NEXT: movq %rbx, 48(%rdi)
+; AVX2-NEXT: movq %r11, 40(%rdi)
+; AVX2-NEXT: movq %r10, 32(%rdi)
+; AVX2-NEXT: movq %r9, 24(%rdi)
+; AVX2-NEXT: movq %rdx, 16(%rdi)
+; AVX2-NEXT: movq %r8, 8(%rdi)
+; AVX2-NEXT: movq %r14, (%rdi)
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
-; ZNVER4-LABEL: ashr_i512_1:
-; ZNVER4: # %bb.0:
-; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; ZNVER4-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; ZNVER4-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; ZNVER4-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
-; ZNVER4-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
-; ZNVER4-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; ZNVER4-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
-; ZNVER4-NEXT: vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; ZNVER4-NEXT: vpsraq $1, %xmm2, %xmm2
-; ZNVER4-NEXT: vpshldq $63, %zmm0, %zmm3, %zmm0
-; ZNVER4-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
-; ZNVER4-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; ZNVER4-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; ZNVER4-NEXT: retq
- %d = bitcast <8 x i64> %a to i512
- %s = ashr i512 %d, 1
- %r = bitcast i512 %s to <8 x i64>
- ret <8 x i64> %r
+; AVX512F-LABEL: ashr_signbit_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: pushq %r14
+; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = -1
+; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %esi, %ecx
+; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: shrl $3, %esi
+; AVX512F-NEXT: andl $56, %esi
+; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512F-NEXT: movq %rax, %r8
+; AVX512F-NEXT: shrdq %cl, %rdx, %r8
+; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512F-NEXT: shrdq %cl, %r9, %rdx
+; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512F-NEXT: shrdq %cl, %r10, %r9
+; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512F-NEXT: shrdq %cl, %r11, %r10
+; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512F-NEXT: shrdq %cl, %rbx, %r11
+; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14
+; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi
+; AVX512F-NEXT: shrdq %cl, %rsi, %rbx
+; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: sarxq %rcx, %rsi, %rcx
+; AVX512F-NEXT: movq %rcx, 56(%rdi)
+; AVX512F-NEXT: movq %rbx, 48(%rdi)
+; AVX512F-NEXT: movq %r11, 40(%rdi)
+; AVX512F-NEXT: movq %r10, 32(%rdi)
+; AVX512F-NEXT: movq %r9, 24(%rdi)
+; AVX512F-NEXT: movq %rdx, 16(%rdi)
+; AVX512F-NEXT: movq %r8, 8(%rdi)
+; AVX512F-NEXT: movq %r14, (%rdi)
+; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: ashr_signbit_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: pushq %r14
+; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: shrl $3, %esi
+; AVX512VL-NEXT: andl $56, %esi
+; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512VL-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VL-NEXT: movq %rax, %r8
+; AVX512VL-NEXT: shrdq %cl, %rdx, %r8
+; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
+; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VL-NEXT: shrdq %cl, %r10, %r9
+; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VL-NEXT: shrdq %cl, %r11, %r10
+; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
+; AVX512VL-NEXT: movq %rcx, 56(%rdi)
+; AVX512VL-NEXT: movq %rbx, 48(%rdi)
+; AVX512VL-NEXT: movq %r11, 40(%rdi)
+; AVX512VL-NEXT: movq %r10, 32(%rdi)
+; AVX512VL-NEXT: movq %r9, 24(%rdi)
+; AVX512VL-NEXT: movq %rdx, 16(%rdi)
+; AVX512VL-NEXT: movq %r8, 8(%rdi)
+; AVX512VL-NEXT: movq %rsi, (%rdi)
+; AVX512VL-NEXT: addq $8, %rsp
+; AVX512VL-NEXT: popq %rbx
+; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512VBMI-LABEL: ashr_signbit_i512:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: pushq %r14
+; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: andl $63, %ecx
+; AVX512VBMI-NEXT: shrl $3, %esi
+; AVX512VBMI-NEXT: andl $56, %esi
+; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx
+; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax
+; AVX512VBMI-NEXT: movq %rax, %r8
+; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8
+; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9
+; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
+; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10
+; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
+; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11
+; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
+; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx
+; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
+; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14
+; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
+; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi
+; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
+; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
+; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
+; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
+; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
+; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
+; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
+; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
+; AVX512VBMI-NEXT: movq %rsi, (%rdi)
+; AVX512VBMI-NEXT: addq $8, %rsp
+; AVX512VBMI-NEXT: popq %rbx
+; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: vzeroupper
+; AVX512VBMI-NEXT: retq
+ %s = shl i512 1, 511
+ %r = ashr i512 %s, %a0
+ ret i512 %r
}
diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index ce56283..8cb0327 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -10,10 +10,10 @@ declare <4 x i32> @llvm.smul.fix.v4i32(<4 x i32>, <4 x i32>, i32)
define i32 @func(i32 %x, i32 %y) nounwind {
; X64-LABEL: func:
; X64: # %bb.0:
-; X64-NEXT: movslq %esi, %rax
-; X64-NEXT: movslq %edi, %rcx
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movslq %esi, %rcx
+; X64-NEXT: movslq %edi, %rax
+; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shrq $32, %rax
; X64-NEXT: shldl $30, %ecx, %eax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
index 18588aa..fade0f7 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -fp-contract=fast < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -stop-after=finalize-isel 2>&1 | FileCheck %s
declare float @llvm.sqrt.f32(float) #2
@@ -24,17 +24,17 @@ define float @sqrt_ieee_ninf(float %f) #0 {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf afn VRSQRTSSr killed [[DEF]], [[COPY]]
- ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
+ ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf contract afn VRSQRTSSr killed [[DEF]], [[COPY]]
+ ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
- ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
+ ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool)
- ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
- ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
+ ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY killed [[VMULSSrr5]]
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]]
; CHECK-NEXT: [[VPBROADCASTDrm:%[0-9]+]]:vr128 = VPBROADCASTDrm $rip, 1, $noreg, %const.2, $noreg :: (load (s32) from constant-pool)
@@ -46,7 +46,7 @@ define float @sqrt_ieee_ninf(float %f) #0 {
; CHECK-NEXT: [[COPY5:%[0-9]+]]:fr32 = COPY killed [[VPANDNrr]]
; CHECK-NEXT: $xmm0 = COPY [[COPY5]]
; CHECK-NEXT: RET 0, $xmm0
- %call = tail call ninf afn float @llvm.sqrt.f32(float %f)
+ %call = tail call ninf afn contract float @llvm.sqrt.f32(float %f)
ret float %call
}
@@ -71,17 +71,17 @@ define float @sqrt_daz_ninf(float %f) #1 {
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf afn VRSQRTSSr killed [[DEF]], [[COPY]]
- ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
+ ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = ninf contract afn VRSQRTSSr killed [[DEF]], [[COPY]]
+ ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool)
- ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
+ ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr
; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool)
- ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
- ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
- ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr
+ ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr
+ ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf contract afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY killed [[VMULSSrr5]]
; CHECK-NEXT: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS
; CHECK-NEXT: [[VCMPSSrri:%[0-9]+]]:fr32 = nofpexcept VCMPSSrri [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr
@@ -90,7 +90,7 @@ define float @sqrt_daz_ninf(float %f) #1 {
; CHECK-NEXT: [[COPY3:%[0-9]+]]:fr32 = COPY killed [[VPANDNrr]]
; CHECK-NEXT: $xmm0 = COPY [[COPY3]]
; CHECK-NEXT: RET 0, $xmm0
- %call = tail call ninf afn float @llvm.sqrt.f32(float %f)
+ %call = tail call ninf afn contract float @llvm.sqrt.f32(float %f)
ret float %call
}
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index bb7245c..82e840b 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2213,12 +2213,12 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE2-NEXT: movq %rdi, %rax
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm5
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [9,0,41,183,1,1,161,221]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [9,u,0,u,41,u,183,u,1,u,1,u,161,u,221,u]
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; CHECK-SSE2-NEXT: pand %xmm4, %xmm5
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm6
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [171,103,183,171,61,1,127,183]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [171,u,103,u,183,u,171,u,61,u,1,u,127,u,183,u]
; CHECK-SSE2-NEXT: pand %xmm4, %xmm6
; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6
; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
@@ -2242,10 +2242,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE2-NEXT: por %xmm7, %xmm5
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [223,223,205,183,161,1,171,239]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [223,u,223,u,205,u,183,u,161,u,1,u,171,u,239,u]
; CHECK-SSE2-NEXT: pand %xmm4, %xmm1
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,205,27,241,1,1,1,163]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,u,205,u,27,u,241,u,1,u,1,u,1,u,163,u]
; CHECK-SSE2-NEXT: pand %xmm4, %xmm0
; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -2275,8 +2275,8 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4
; CHECK-SSE41-NEXT: movq %rdi, %rax
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [171,103,183,171,61,1,127,183,9,0,41,183,1,1,161,221]
+; CHECK-SSE41-NEXT: pmullw %xmm1, %xmm0
; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; CHECK-SSE41-NEXT: pand %xmm5, %xmm0
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm6
@@ -2302,8 +2302,8 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm1
; CHECK-SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1
-; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm0
-; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0]
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [197,205,27,241,1,1,1,163,223,223,205,183,161,1,171,239]
+; CHECK-SSE41-NEXT: pmullw %xmm4, %xmm0
; CHECK-SSE41-NEXT: pand %xmm5, %xmm0
; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239]
; CHECK-SSE41-NEXT: psllw $8, %xmm4
@@ -2341,7 +2341,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm4
-; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0]
+; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm5 # [0,0,0,0,1,1,1,0,1,1,0,1,0,1,0,1]
; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; CHECK-AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm6 # [0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1]
@@ -2361,7 +2361,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-AVX1-NEXT: vpand %xmm5, %xmm4, %xmm4
; CHECK-AVX1-NEXT: vpaddb %xmm4, %xmm6, %xmm4
-; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm6 # [13,0,19,0,2,0,2,0,62,0,5,0,97,0,3,0]
+; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm6 # [13,5,19,34,2,8,2,88,62,62,5,7,97,2,3,60]
; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm6
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60]
; CHECK-AVX1-NEXT: vpsllw $8, %xmm4, %xmm4
@@ -2375,7 +2375,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
; CHECK-AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7
; CHECK-AVX1-NEXT: vpackuswb %xmm6, %xmm7, %xmm6
-; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]
+; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm7 # [0,0,1,0,0,255,0,1,0,1,0,1,1,1,0,1]
; CHECK-AVX1-NEXT: vpand %xmm3, %xmm7, %xmm7
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8 # [0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1]
; CHECK-AVX1-NEXT: vpsllw $8, %xmm8, %xmm8
@@ -2394,7 +2394,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
; CHECK-AVX1-NEXT: vpand %xmm5, %xmm6, %xmm5
; CHECK-AVX1-NEXT: vpaddb %xmm5, %xmm7, %xmm5
-; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm6 # [3,0,7,0,84,0,127,0,114,0,50,0,2,0,97,0]
+; CHECK-AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm6 # [3,87,7,6,84,128,127,56,114,1,50,7,2,8,97,117]
; CHECK-AVX1-NEXT: vpand %xmm3, %xmm6, %xmm3
; CHECK-AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5 # [0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117]
; CHECK-AVX1-NEXT: vpsllw $8, %xmm5, %xmm5
@@ -2423,7 +2423,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47,0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
; CHECK-AVX2-NEXT: vpsrlw $8, %ymm4, %ymm4
; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3
-; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0]
+; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 # [0,0,0,0,1,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,0,255,0,1,0,1,0,1,1,1,0,1]
; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm6 # [0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,1,0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1]
@@ -2443,7 +2443,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; CHECK-AVX2-NEXT: vpaddb %ymm3, %ymm4, %ymm3
-; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 # [13,0,19,0,2,0,2,0,62,0,5,0,97,0,3,0,3,0,7,0,84,0,127,0,114,0,50,0,2,0,97,0]
+; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4 # [13,5,19,34,2,8,2,88,62,62,5,7,97,2,3,60,3,87,7,6,84,128,127,56,114,1,50,7,2,8,97,117]
; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
; CHECK-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60,0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117]
; CHECK-AVX2-NEXT: vpsllw $8, %ymm3, %ymm3
@@ -2458,7 +2458,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
;
; CHECK-AVX512VL-LABEL: pr51133:
; CHECK-AVX512VL: # %bb.0:
-; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0,171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0]
+; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,205,27,241,1,1,1,163,223,223,205,183,161,1,171,239,171,103,183,171,61,1,127,183,9,0,41,183,1,1,161,221]
; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239,0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221]
; CHECK-AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3
; CHECK-AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
index e936e1e..0fb6eb3 100644
--- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; SSE-LABEL: fold_srem_vec_1:
@@ -55,55 +57,105 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: fold_srem_vec_1:
-; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: movswl %ax, %ecx
-; AVX-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: subl %eax, %ecx
-; AVX-NEXT: movzwl %cx, %ecx
-; AVX-NEXT: movswl %cx, %edx
-; AVX-NEXT: shrl $15, %ecx
-; AVX-NEXT: sarl $9, %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vmovd %xmm0, %ecx
-; AVX-NEXT: movswl %cx, %edx
-; AVX-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77
-; AVX-NEXT: shrl $16, %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: movswl %dx, %esi
-; AVX-NEXT: shrl $15, %edx
-; AVX-NEXT: sarl $6, %esi
-; AVX-NEXT: addl %edx, %esi
-; AVX-NEXT: imull $95, %esi, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vmovd %ecx, %xmm1
-; AVX-NEXT: vpextrw $1, %xmm0, %ecx
-; AVX-NEXT: movswl %cx, %edx
-; AVX-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF
-; AVX-NEXT: movl %edx, %esi
-; AVX-NEXT: shrl $31, %esi
-; AVX-NEXT: sarl $21, %edx
-; AVX-NEXT: addl %esi, %edx
-; AVX-NEXT: imull $-124, %edx, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $2, %xmm0, %ecx
-; AVX-NEXT: movswl %cx, %edx
-; AVX-NEXT: imull $2675, %edx, %edx # imm = 0xA73
-; AVX-NEXT: movl %edx, %esi
-; AVX-NEXT: shrl $31, %esi
-; AVX-NEXT: sarl $18, %edx
-; AVX-NEXT: addl %esi, %edx
-; AVX-NEXT: imull $98, %edx, %edx
-; AVX-NEXT: subl %edx, %ecx
-; AVX-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0
-; AVX-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: fold_srem_vec_1:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax
+; AVX1OR2-NEXT: movswl %ax, %ecx
+; AVX1OR2-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51
+; AVX1OR2-NEXT: shrl $16, %ecx
+; AVX1OR2-NEXT: subl %eax, %ecx
+; AVX1OR2-NEXT: movzwl %cx, %ecx
+; AVX1OR2-NEXT: movswl %cx, %edx
+; AVX1OR2-NEXT: shrl $15, %ecx
+; AVX1OR2-NEXT: sarl $9, %edx
+; AVX1OR2-NEXT: addl %ecx, %edx
+; AVX1OR2-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15
+; AVX1OR2-NEXT: subl %ecx, %eax
+; AVX1OR2-NEXT: vmovd %xmm0, %ecx
+; AVX1OR2-NEXT: movswl %cx, %edx
+; AVX1OR2-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77
+; AVX1OR2-NEXT: shrl $16, %edx
+; AVX1OR2-NEXT: addl %ecx, %edx
+; AVX1OR2-NEXT: movzwl %dx, %edx
+; AVX1OR2-NEXT: movswl %dx, %esi
+; AVX1OR2-NEXT: shrl $15, %edx
+; AVX1OR2-NEXT: sarl $6, %esi
+; AVX1OR2-NEXT: addl %edx, %esi
+; AVX1OR2-NEXT: imull $95, %esi, %edx
+; AVX1OR2-NEXT: subl %edx, %ecx
+; AVX1OR2-NEXT: vmovd %ecx, %xmm1
+; AVX1OR2-NEXT: vpextrw $1, %xmm0, %ecx
+; AVX1OR2-NEXT: movswl %cx, %edx
+; AVX1OR2-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF
+; AVX1OR2-NEXT: movl %edx, %esi
+; AVX1OR2-NEXT: shrl $31, %esi
+; AVX1OR2-NEXT: sarl $21, %edx
+; AVX1OR2-NEXT: addl %esi, %edx
+; AVX1OR2-NEXT: imull $-124, %edx, %edx
+; AVX1OR2-NEXT: subl %edx, %ecx
+; AVX1OR2-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpextrw $2, %xmm0, %ecx
+; AVX1OR2-NEXT: movswl %cx, %edx
+; AVX1OR2-NEXT: imull $2675, %edx, %edx # imm = 0xA73
+; AVX1OR2-NEXT: movl %edx, %esi
+; AVX1OR2-NEXT: shrl $31, %esi
+; AVX1OR2-NEXT: sarl $18, %edx
+; AVX1OR2-NEXT: addl %esi, %edx
+; AVX1OR2-NEXT: imull $98, %edx, %edx
+; AVX1OR2-NEXT: subl %edx, %ecx
+; AVX1OR2-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0
+; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: fold_srem_vec_1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpextrw $3, %xmm0, %eax
+; AVX512-NEXT: movswl %ax, %ecx
+; AVX512-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51
+; AVX512-NEXT: shrl $16, %ecx
+; AVX512-NEXT: subl %eax, %ecx
+; AVX512-NEXT: movzwl %cx, %edx
+; AVX512-NEXT: movswl %dx, %ecx
+; AVX512-NEXT: shrl $15, %edx
+; AVX512-NEXT: sarl $9, %ecx
+; AVX512-NEXT: addl %edx, %ecx
+; AVX512-NEXT: vmovd %xmm0, %edx
+; AVX512-NEXT: movswl %dx, %esi
+; AVX512-NEXT: imull $-21385, %esi, %esi # imm = 0xAC77
+; AVX512-NEXT: shrl $16, %esi
+; AVX512-NEXT: addl %edx, %esi
+; AVX512-NEXT: movzwl %si, %esi
+; AVX512-NEXT: movswl %si, %edi
+; AVX512-NEXT: shrl $15, %esi
+; AVX512-NEXT: sarl $6, %edi
+; AVX512-NEXT: addl %esi, %edi
+; AVX512-NEXT: imull $95, %edi, %esi
+; AVX512-NEXT: subl %esi, %edx
+; AVX512-NEXT: vmovd %edx, %xmm1
+; AVX512-NEXT: vpextrw $1, %xmm0, %edx
+; AVX512-NEXT: movswl %dx, %esi
+; AVX512-NEXT: imull $-16913, %esi, %esi # imm = 0xBDEF
+; AVX512-NEXT: movl %esi, %edi
+; AVX512-NEXT: shrl $31, %edi
+; AVX512-NEXT: sarl $21, %esi
+; AVX512-NEXT: addl %edi, %esi
+; AVX512-NEXT: imull $-1003, %ecx, %ecx # imm = 0xFC15
+; AVX512-NEXT: imull $-124, %esi, %esi
+; AVX512-NEXT: subl %esi, %edx
+; AVX512-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1
+; AVX512-NEXT: vpextrw $2, %xmm0, %edx
+; AVX512-NEXT: subl %ecx, %eax
+; AVX512-NEXT: movswl %dx, %ecx
+; AVX512-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
+; AVX512-NEXT: movl %ecx, %esi
+; AVX512-NEXT: shrl $31, %esi
+; AVX512-NEXT: sarl $18, %ecx
+; AVX512-NEXT: addl %esi, %ecx
+; AVX512-NEXT: imull $98, %ecx, %ecx
+; AVX512-NEXT: subl %ecx, %edx
+; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm0
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512-NEXT: retq
%1 = srem <4 x i16> %x, <i16 95, i16 -124, i16 98, i16 -1003>
ret <4 x i16> %1
}
@@ -139,20 +191,35 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; Don't fold if we can combine srem with sdiv.
define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
-; SSE-LABEL: combine_srem_sdiv:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
-; SSE-NEXT: pmulhw %xmm0, %xmm1
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrlw $15, %xmm2
-; SSE-NEXT: psraw $6, %xmm1
-; SSE-NEXT: paddw %xmm2, %xmm1
-; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
-; SSE-NEXT: pmullw %xmm1, %xmm2
-; SSE-NEXT: psubw %xmm2, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_srem_sdiv:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE2-NEXT: pmulhw %xmm0, %xmm1
+; SSE2-NEXT: paddw %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrlw $15, %xmm2
+; SSE2-NEXT: psraw $6, %xmm1
+; SSE2-NEXT: paddw %xmm2, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
+; SSE2-NEXT: pmullw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: combine_srem_sdiv:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE4-NEXT: pmulhw %xmm0, %xmm1
+; SSE4-NEXT: paddw %xmm0, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm2
+; SSE4-NEXT: psrlw $15, %xmm2
+; SSE4-NEXT: psraw $6, %xmm1
+; SSE4-NEXT: paddw %xmm2, %xmm1
+; SSE4-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
+; SSE4-NEXT: pmullw %xmm1, %xmm2
+; SSE4-NEXT: psubw %xmm2, %xmm0
+; SSE4-NEXT: paddw %xmm1, %xmm0
+; SSE4-NEXT: retq
;
; AVX-LABEL: combine_srem_sdiv:
; AVX: # %bb.0:
@@ -421,48 +488,93 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; Don't fold i64 srem.
define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
-; SSE-LABEL: dont_fold_srem_i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: movq %xmm1, %rcx
-; SSE-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: imulq %rdx
-; SSE-NEXT: addq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %rax
-; SSE-NEXT: shrq $63, %rax
-; SSE-NEXT: sarq $4, %rdx
-; SSE-NEXT: addq %rax, %rdx
-; SSE-NEXT: leaq (%rdx,%rdx,2), %rax
-; SSE-NEXT: shlq $3, %rax
-; SSE-NEXT: subq %rax, %rdx
-; SSE-NEXT: addq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm1
-; SSE-NEXT: pextrq $1, %xmm2, %rcx
-; SSE-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: imulq %rdx
-; SSE-NEXT: movq %rdx, %rax
-; SSE-NEXT: shrq $63, %rax
-; SSE-NEXT: sarq $11, %rdx
-; SSE-NEXT: addq %rax, %rdx
-; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
-; SSE-NEXT: subq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm2
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT: pextrq $1, %xmm0, %rcx
-; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: imulq %rdx
-; SSE-NEXT: movq %rdx, %rax
-; SSE-NEXT: shrq $63, %rax
-; SSE-NEXT: sarq $8, %rdx
-; SSE-NEXT: addq %rax, %rdx
-; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
-; SSE-NEXT: subq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE-NEXT: retq
+; SSE2-LABEL: dont_fold_srem_i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: movq %xmm1, %rcx
+; SSE2-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: imulq %rdx
+; SSE2-NEXT: addq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $63, %rax
+; SSE2-NEXT: sarq $4, %rdx
+; SSE2-NEXT: addq %rax, %rdx
+; SSE2-NEXT: leaq (%rdx,%rdx,2), %rax
+; SSE2-NEXT: shlq $3, %rax
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: addq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE2-NEXT: movq %xmm2, %rcx
+; SSE2-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: imulq %rdx
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $63, %rax
+; SSE2-NEXT: sarq $11, %rdx
+; SSE2-NEXT: addq %rax, %rdx
+; SSE2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
+; SSE2-NEXT: subq %rax, %rcx
+; SSE2-NEXT: movq %rcx, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT: movq %xmm0, %rcx
+; SSE2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: imulq %rdx
+; SSE2-NEXT: movq %rdx, %rax
+; SSE2-NEXT: shrq $63, %rax
+; SSE2-NEXT: sarq $8, %rdx
+; SSE2-NEXT: addq %rax, %rdx
+; SSE2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
+; SSE2-NEXT: subq %rax, %rcx
+; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: dont_fold_srem_i64:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movdqa %xmm1, %xmm2
+; SSE4-NEXT: movq %xmm1, %rcx
+; SSE4-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
+; SSE4-NEXT: movq %rcx, %rax
+; SSE4-NEXT: imulq %rdx
+; SSE4-NEXT: addq %rcx, %rdx
+; SSE4-NEXT: movq %rdx, %rax
+; SSE4-NEXT: shrq $63, %rax
+; SSE4-NEXT: sarq $4, %rdx
+; SSE4-NEXT: addq %rax, %rdx
+; SSE4-NEXT: leaq (%rdx,%rdx,2), %rax
+; SSE4-NEXT: shlq $3, %rax
+; SSE4-NEXT: subq %rax, %rdx
+; SSE4-NEXT: addq %rcx, %rdx
+; SSE4-NEXT: movq %rdx, %xmm1
+; SSE4-NEXT: pextrq $1, %xmm2, %rcx
+; SSE4-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
+; SSE4-NEXT: movq %rcx, %rax
+; SSE4-NEXT: imulq %rdx
+; SSE4-NEXT: movq %rdx, %rax
+; SSE4-NEXT: shrq $63, %rax
+; SSE4-NEXT: sarq $11, %rdx
+; SSE4-NEXT: addq %rax, %rdx
+; SSE4-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
+; SSE4-NEXT: subq %rax, %rcx
+; SSE4-NEXT: movq %rcx, %xmm2
+; SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE4-NEXT: pextrq $1, %xmm0, %rcx
+; SSE4-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
+; SSE4-NEXT: movq %rcx, %rax
+; SSE4-NEXT: imulq %rdx
+; SSE4-NEXT: movq %rdx, %rax
+; SSE4-NEXT: shrq $63, %rax
+; SSE4-NEXT: sarq $8, %rdx
+; SSE4-NEXT: addq %rax, %rdx
+; SSE4-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
+; SSE4-NEXT: subq %rax, %rcx
+; SSE4-NEXT: movq %rcx, %xmm0
+; SSE4-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE4-NEXT: retq
;
; AVX1-LABEL: dont_fold_srem_i64:
; AVX1: # %bb.0:
@@ -551,6 +663,50 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: dont_fold_srem_i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovq %xmm1, %rcx
+; AVX512-NEXT: movabsq $-5614226457215950491, %rdx # imm = 0xB21642C8590B2165
+; AVX512-NEXT: movq %rcx, %rax
+; AVX512-NEXT: imulq %rdx
+; AVX512-NEXT: addq %rcx, %rdx
+; AVX512-NEXT: movq %rdx, %rax
+; AVX512-NEXT: shrq $63, %rax
+; AVX512-NEXT: sarq $4, %rdx
+; AVX512-NEXT: addq %rax, %rdx
+; AVX512-NEXT: leaq (%rdx,%rdx,2), %rax
+; AVX512-NEXT: shlq $3, %rax
+; AVX512-NEXT: subq %rax, %rdx
+; AVX512-NEXT: addq %rcx, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512-NEXT: vmovq %rdx, %xmm1
+; AVX512-NEXT: movabsq $6966426675817289639, %rdx # imm = 0x60ADB826E5E517A7
+; AVX512-NEXT: movq %rcx, %rax
+; AVX512-NEXT: imulq %rdx
+; AVX512-NEXT: movq %rdx, %rax
+; AVX512-NEXT: shrq $63, %rax
+; AVX512-NEXT: sarq $11, %rdx
+; AVX512-NEXT: addq %rax, %rdx
+; AVX512-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
+; AVX512-NEXT: subq %rax, %rcx
+; AVX512-NEXT: vmovq %rcx, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm2[0]
+; AVX512-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
+; AVX512-NEXT: movq %rcx, %rax
+; AVX512-NEXT: imulq %rdx
+; AVX512-NEXT: movq %rdx, %rax
+; AVX512-NEXT: shrq $63, %rax
+; AVX512-NEXT: sarq $8, %rdx
+; AVX512-NEXT: addq %rax, %rdx
+; AVX512-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
+; AVX512-NEXT: subq %rax, %rcx
+; AVX512-NEXT: vmovq %rcx, %xmm1
+; AVX512-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: retq
%1 = srem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
ret <4 x i64> %1
}
diff --git a/llvm/test/CodeGen/X86/sshl_sat.ll b/llvm/test/CodeGen/X86/sshl_sat.ll
index e5ea911..a93be22 100644
--- a/llvm/test/CodeGen/X86/sshl_sat.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat.ll
@@ -15,16 +15,16 @@ define i16 @func(i16 %x, i16 %y) nounwind {
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: movl %edi, %edx
-; X64-NEXT: shll %cl, %edx
-; X64-NEXT: movswl %dx, %esi
+; X64-NEXT: shll %cl, %edi
+; X64-NEXT: movswl %di, %esi
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: sarl %cl, %esi
; X64-NEXT: xorl %eax, %eax
-; X64-NEXT: testw %di, %di
+; X64-NEXT: testw %dx, %dx
; X64-NEXT: sets %al
; X64-NEXT: addl $32767, %eax # imm = 0x7FFF
-; X64-NEXT: cmpw %si, %di
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: cmpw %si, %dx
+; X64-NEXT: cmovel %edi, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
@@ -33,17 +33,17 @@ define i16 @func(i16 %x, i16 %y) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movswl %si, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movswl %dx, %edi
; X86-NEXT: sarl %cl, %edi
; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: testw %dx, %dx
+; X86-NEXT: testw %si, %si
; X86-NEXT: sets %al
; X86-NEXT: addl $32767, %eax # imm = 0x7FFF
-; X86-NEXT: cmpw %di, %dx
-; X86-NEXT: cmovel %esi, %eax
+; X86-NEXT: cmpw %di, %si
+; X86-NEXT: cmovel %edx, %eax
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -58,18 +58,18 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X64-NEXT: movl %esi, %ecx
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: addl %eax, %eax
-; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: movl %eax, %edx
+; X64-NEXT: xorl %esi, %esi
; X64-NEXT: testw %ax, %ax
-; X64-NEXT: sets %dl
-; X64-NEXT: addl $32767, %edx # imm = 0x7FFF
-; X64-NEXT: movl %eax, %esi
-; X64-NEXT: shll %cl, %esi
-; X64-NEXT: movswl %si, %edi
+; X64-NEXT: sets %sil
+; X64-NEXT: addl $32767, %esi # imm = 0x7FFF
+; X64-NEXT: shll %cl, %eax
+; X64-NEXT: movswl %ax, %edi
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: sarl %cl, %edi
-; X64-NEXT: cmpw %di, %ax
-; X64-NEXT: cmovnel %edx, %esi
-; X64-NEXT: movswl %si, %eax
+; X64-NEXT: cmpw %di, %dx
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: cwtl
; X64-NEXT: shrl %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
index 10dee14..ff76707 100644
--- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll
@@ -365,119 +365,118 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: movswl %bx, %ebp
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movswl %di, %ebp
; X86-NEXT: sarl %cl, %ebp
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testw %di, %di
+; X86-NEXT: testw %bx, %bx
; X86-NEXT: sets %cl
; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT: cmpw %bp, %di
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmovel %ebx, %ecx
+; X86-NEXT: cmpw %bp, %bx
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: cmovel %edi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movswl %di, %ebx
-; X86-NEXT: sarl %cl, %ebx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: testw %si, %si
-; X86-NEXT: sets %al
-; X86-NEXT: addl $32767, %eax # imm = 0x7FFF
-; X86-NEXT: cmpw %bx, %si
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmovel %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %esi
; X86-NEXT: shll %cl, %esi
; X86-NEXT: movswl %si, %edi
; X86-NEXT: sarl %cl, %edi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: testw %dx, %dx
-; X86-NEXT: sets %al
-; X86-NEXT: addl $32767, %eax # imm = 0x7FFF
-; X86-NEXT: cmpw %di, %dx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: testw %bx, %bx
+; X86-NEXT: sets %cl
+; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: cmpw %di, %bx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmovel %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: cmovel %esi, %ebp
; X86-NEXT: shll %cl, %edx
; X86-NEXT: movswl %dx, %esi
; X86-NEXT: sarl %cl, %esi
; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: testw %ax, %ax
+; X86-NEXT: testw %di, %di
; X86-NEXT: sets %bl
; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF
-; X86-NEXT: cmpw %si, %ax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpw %si, %di
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmovel %edx, %ebx
-; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movswl %ax, %edx
+; X86-NEXT: sarl %cl, %edx
+; X86-NEXT: xorl %ecx, %ecx
+; X86-NEXT: testw %si, %si
+; X86-NEXT: sets %cl
+; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT: cmpw %dx, %si
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: cmovel %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movswl %dx, %esi
-; X86-NEXT: sarl %cl, %esi
+; X86-NEXT: movswl %dx, %eax
+; X86-NEXT: sarl %cl, %eax
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testw %ax, %ax
+; X86-NEXT: testw %si, %si
; X86-NEXT: sets %cl
; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT: cmpw %si, %ax
+; X86-NEXT: cmpw %ax, %si
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmovel %edx, %ecx
-; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill
; X86-NEXT: movl %eax, %edx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movswl %dx, %esi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movswl %ax, %esi
; X86-NEXT: sarl %cl, %esi
; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: testw %ax, %ax
+; X86-NEXT: testw %dx, %dx
; X86-NEXT: sets %bl
; X86-NEXT: addl $32767, %ebx # imm = 0x7FFF
-; X86-NEXT: cmpw %si, %ax
-; X86-NEXT: cmovel %edx, %ebx
+; X86-NEXT: cmpw %si, %dx
+; X86-NEXT: cmovel %eax, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %esi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movswl %si, %edi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movswl %ax, %edi
; X86-NEXT: sarl %cl, %edi
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: testw %ax, %ax
+; X86-NEXT: testw %si, %si
; X86-NEXT: sets %dl
; X86-NEXT: addl $32767, %edx # imm = 0x7FFF
-; X86-NEXT: cmpw %di, %ax
-; X86-NEXT: cmovel %esi, %edx
+; X86-NEXT: cmpw %di, %si
+; X86-NEXT: cmovel %eax, %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %esi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movswl %si, %edi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movswl %ax, %edi
; X86-NEXT: sarl %cl, %edi
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: testw %ax, %ax
+; X86-NEXT: testw %si, %si
; X86-NEXT: sets %cl
; X86-NEXT: addl $32767, %ecx # imm = 0x7FFF
-; X86-NEXT: cmpw %di, %ax
-; X86-NEXT: cmovel %esi, %ecx
+; X86-NEXT: cmpw %di, %si
+; X86-NEXT: cmovel %eax, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movw %cx, 14(%eax)
; X86-NEXT: movw %dx, 12(%eax)
; X86-NEXT: movw %bx, 10(%eax)
-; X86-NEXT: movw %bp, 8(%eax)
; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: movw %cx, 8(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movw %cx, 6(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movw %cx, 4(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movw %cx, 2(%eax)
+; X86-NEXT: movw %bp, 2(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: movw %cx, (%eax)
; X86-NEXT: addl $16, %esp
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
index cd576b1..345fa0e 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
@@ -4,16 +4,16 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>)
declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>)
declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>)
declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>)
declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>)
-define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -23,11 +23,11 @@ define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2)
ret <4 x i32> %2
}
-define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -37,11 +37,11 @@ define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1)
ret <4 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -51,11 +51,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2)
ret <8 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -65,11 +65,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1,
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1)
ret <8 x i32> %2
}
-define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -79,11 +79,11 @@ define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2)
ret <4 x i32> %2
}
-define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -93,11 +93,11 @@ define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+ %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1)
ret <4 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -107,11 +107,11 @@ define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2)
ret <8 x i32> %2
}
-define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -121,7 +121,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
- %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+ %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1)
ret <8 x i32> %2
}
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
index 534352f..47537c8 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O3 -disable-peephole -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
@@ -14,7 +14,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i
declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
-define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -26,11 +26,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4
; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd2,0x44,0x24,0xe8]
; CHECK-NEXT: retq # encoding: [0xc3]
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
ret <4 x i32> %ret
}
-define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -42,11 +42,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8
; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd2,0x44,0x24,0xd8]
; CHECK-NEXT: retq # encoding: [0xc3]
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
ret <8 x i32> %ret
}
-define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -58,11 +58,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4
; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd3,0x44,0x24,0xe8]
; CHECK-NEXT: retq # encoding: [0xc3]
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
ret <4 x i32> %ret
}
-define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -74,7 +74,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8
; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd3,0x44,0x24,0xd8]
; CHECK-NEXT: retq # encoding: [0xc3]
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
ret <8 x i32> %ret
}
diff --git a/llvm/test/CodeGen/X86/stackmap.ll b/llvm/test/CodeGen/X86/stackmap.ll
index 72406aa..9bf88cb 100644
--- a/llvm/test/CodeGen/X86/stackmap.ll
+++ b/llvm/test/CodeGen/X86/stackmap.ll
@@ -1,7 +1,10 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -terminal-rule=0 | FileCheck %s
;
; Note: Print verbose stackmaps using -debug-only=stackmaps.
+; FIXME: Fix this test so it produces the correctly sized spill once the
+; -terminal-rule=0 flag is removed.
+
; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
; CHECK-NEXT: __LLVM_StackMaps:
; Header
@@ -546,8 +549,8 @@ define void @clobberScratch(i32 %a) {
ret void
}
-; A stack frame which needs to be realigned at runtime (to meet alignment
-; criteria for values on the stack) does not have a fixed frame size.
+; A stack frame which needs to be realigned at runtime (to meet alignment
+; criteria for values on the stack) does not have a fixed frame size.
; CHECK-LABEL: .long L{{.*}}-_needsStackRealignment
; CHECK-NEXT: .short 0
; 0 locations
diff --git a/llvm/test/CodeGen/X86/strictfp-inlineasm.ll b/llvm/test/CodeGen/X86/strictfp-inlineasm.ll
new file mode 100644
index 0000000..674c12a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/strictfp-inlineasm.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -verify-machineinstrs | FileCheck %s --check-prefixes=X64
+
+define i32 @foo() strictfp {
+; X86-LABEL: foo:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: #APP
+; X86-NEXT: #NO_APP
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: foo:
+; X64: # %bb.0: # %entry
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: #APP
+; X64-NEXT: #NO_APP
+; X64-NEXT: movl $-1, %eax
+; X64-NEXT: retq
+entry:
+ tail call void asm sideeffect "", "r"(i32 1) #1, !srcloc !0
+ ret i32 -1
+}
+
+
+!0 = !{i64 87}
diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
index 5bd624c..01fbafb 100644
--- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
+++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll
@@ -2429,126 +2429,126 @@ define void @vec384_v3i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p
; SSE2-ONLY: # %bb.0:
; SSE2-ONLY-NEXT: movl (%rdi), %eax
; SSE2-ONLY-NEXT: notl %eax
-; SSE2-ONLY-NEXT: movw %ax, (%rsi)
; SSE2-ONLY-NEXT: movl %eax, %ecx
-; SSE2-ONLY-NEXT: shrl $16, %ecx
-; SSE2-ONLY-NEXT: movb %cl, 2(%rsi)
-; SSE2-ONLY-NEXT: movb %cl, 2(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, (%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 6(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 4(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 10(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 8(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 14(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 12(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 18(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 16(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 22(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 20(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 26(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 24(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 30(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 28(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 34(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 32(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 38(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 36(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 42(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 40(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 46(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 44(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 50(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 48(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 54(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 52(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 58(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 56(%rdx)
-; SSE2-ONLY-NEXT: movb %cl, 62(%rdx)
-; SSE2-ONLY-NEXT: movw %ax, 60(%rdx)
+; SSE2-ONLY-NEXT: movw %ax, (%rsi)
+; SSE2-ONLY-NEXT: shrl $16, %eax
+; SSE2-ONLY-NEXT: movb %al, 2(%rsi)
+; SSE2-ONLY-NEXT: movb %al, 2(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, (%rdx)
+; SSE2-ONLY-NEXT: movb %al, 6(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 4(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 10(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 8(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 14(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 12(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 18(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 16(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 22(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 20(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 26(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 24(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 30(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 28(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 34(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 32(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 38(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 36(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 42(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 40(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 46(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 44(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 50(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 48(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 54(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 52(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 58(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 56(%rdx)
+; SSE2-ONLY-NEXT: movb %al, 62(%rdx)
+; SSE2-ONLY-NEXT: movw %cx, 60(%rdx)
; SSE2-ONLY-NEXT: retq
;
; SSE3-LABEL: vec384_v3i8:
; SSE3: # %bb.0:
; SSE3-NEXT: movl (%rdi), %eax
; SSE3-NEXT: notl %eax
-; SSE3-NEXT: movw %ax, (%rsi)
; SSE3-NEXT: movl %eax, %ecx
-; SSE3-NEXT: shrl $16, %ecx
-; SSE3-NEXT: movb %cl, 2(%rsi)
-; SSE3-NEXT: movb %cl, 2(%rdx)
-; SSE3-NEXT: movw %ax, (%rdx)
-; SSE3-NEXT: movb %cl, 6(%rdx)
-; SSE3-NEXT: movw %ax, 4(%rdx)
-; SSE3-NEXT: movb %cl, 10(%rdx)
-; SSE3-NEXT: movw %ax, 8(%rdx)
-; SSE3-NEXT: movb %cl, 14(%rdx)
-; SSE3-NEXT: movw %ax, 12(%rdx)
-; SSE3-NEXT: movb %cl, 18(%rdx)
-; SSE3-NEXT: movw %ax, 16(%rdx)
-; SSE3-NEXT: movb %cl, 22(%rdx)
-; SSE3-NEXT: movw %ax, 20(%rdx)
-; SSE3-NEXT: movb %cl, 26(%rdx)
-; SSE3-NEXT: movw %ax, 24(%rdx)
-; SSE3-NEXT: movb %cl, 30(%rdx)
-; SSE3-NEXT: movw %ax, 28(%rdx)
-; SSE3-NEXT: movb %cl, 34(%rdx)
-; SSE3-NEXT: movw %ax, 32(%rdx)
-; SSE3-NEXT: movb %cl, 38(%rdx)
-; SSE3-NEXT: movw %ax, 36(%rdx)
-; SSE3-NEXT: movb %cl, 42(%rdx)
-; SSE3-NEXT: movw %ax, 40(%rdx)
-; SSE3-NEXT: movb %cl, 46(%rdx)
-; SSE3-NEXT: movw %ax, 44(%rdx)
-; SSE3-NEXT: movb %cl, 50(%rdx)
-; SSE3-NEXT: movw %ax, 48(%rdx)
-; SSE3-NEXT: movb %cl, 54(%rdx)
-; SSE3-NEXT: movw %ax, 52(%rdx)
-; SSE3-NEXT: movb %cl, 58(%rdx)
-; SSE3-NEXT: movw %ax, 56(%rdx)
-; SSE3-NEXT: movb %cl, 62(%rdx)
-; SSE3-NEXT: movw %ax, 60(%rdx)
+; SSE3-NEXT: movw %ax, (%rsi)
+; SSE3-NEXT: shrl $16, %eax
+; SSE3-NEXT: movb %al, 2(%rsi)
+; SSE3-NEXT: movb %al, 2(%rdx)
+; SSE3-NEXT: movw %cx, (%rdx)
+; SSE3-NEXT: movb %al, 6(%rdx)
+; SSE3-NEXT: movw %cx, 4(%rdx)
+; SSE3-NEXT: movb %al, 10(%rdx)
+; SSE3-NEXT: movw %cx, 8(%rdx)
+; SSE3-NEXT: movb %al, 14(%rdx)
+; SSE3-NEXT: movw %cx, 12(%rdx)
+; SSE3-NEXT: movb %al, 18(%rdx)
+; SSE3-NEXT: movw %cx, 16(%rdx)
+; SSE3-NEXT: movb %al, 22(%rdx)
+; SSE3-NEXT: movw %cx, 20(%rdx)
+; SSE3-NEXT: movb %al, 26(%rdx)
+; SSE3-NEXT: movw %cx, 24(%rdx)
+; SSE3-NEXT: movb %al, 30(%rdx)
+; SSE3-NEXT: movw %cx, 28(%rdx)
+; SSE3-NEXT: movb %al, 34(%rdx)
+; SSE3-NEXT: movw %cx, 32(%rdx)
+; SSE3-NEXT: movb %al, 38(%rdx)
+; SSE3-NEXT: movw %cx, 36(%rdx)
+; SSE3-NEXT: movb %al, 42(%rdx)
+; SSE3-NEXT: movw %cx, 40(%rdx)
+; SSE3-NEXT: movb %al, 46(%rdx)
+; SSE3-NEXT: movw %cx, 44(%rdx)
+; SSE3-NEXT: movb %al, 50(%rdx)
+; SSE3-NEXT: movw %cx, 48(%rdx)
+; SSE3-NEXT: movb %al, 54(%rdx)
+; SSE3-NEXT: movw %cx, 52(%rdx)
+; SSE3-NEXT: movb %al, 58(%rdx)
+; SSE3-NEXT: movw %cx, 56(%rdx)
+; SSE3-NEXT: movb %al, 62(%rdx)
+; SSE3-NEXT: movw %cx, 60(%rdx)
; SSE3-NEXT: retq
;
; SSSE3-ONLY-LABEL: vec384_v3i8:
; SSSE3-ONLY: # %bb.0:
; SSSE3-ONLY-NEXT: movl (%rdi), %eax
; SSSE3-ONLY-NEXT: notl %eax
-; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
; SSSE3-ONLY-NEXT: movl %eax, %ecx
-; SSSE3-ONLY-NEXT: shrl $16, %ecx
-; SSSE3-ONLY-NEXT: movb %cl, 2(%rsi)
-; SSSE3-ONLY-NEXT: movb %cl, 2(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, (%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 6(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 4(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 10(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 8(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 14(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 12(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 18(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 16(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 22(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 20(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 26(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 24(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 30(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 28(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 34(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 32(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 38(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 36(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 42(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 40(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 46(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 44(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 50(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 48(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 54(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 52(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 58(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 56(%rdx)
-; SSSE3-ONLY-NEXT: movb %cl, 62(%rdx)
-; SSSE3-ONLY-NEXT: movw %ax, 60(%rdx)
+; SSSE3-ONLY-NEXT: movw %ax, (%rsi)
+; SSSE3-ONLY-NEXT: shrl $16, %eax
+; SSSE3-ONLY-NEXT: movb %al, 2(%rsi)
+; SSSE3-ONLY-NEXT: movb %al, 2(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, (%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 6(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 4(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 10(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 8(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 14(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 12(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 18(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 16(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 22(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 20(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 26(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 24(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 30(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 28(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 34(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 32(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 38(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 36(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 42(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 40(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 46(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 44(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 50(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 48(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 54(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 52(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 58(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 56(%rdx)
+; SSSE3-ONLY-NEXT: movb %al, 62(%rdx)
+; SSSE3-ONLY-NEXT: movw %cx, 60(%rdx)
; SSSE3-ONLY-NEXT: retq
;
; SSE41-LABEL: vec384_v3i8:
diff --git a/llvm/test/CodeGen/X86/twoaddr-lea.ll b/llvm/test/CodeGen/X86/twoaddr-lea.ll
index f20b777..3ad3e9a 100644
--- a/llvm/test/CodeGen/X86/twoaddr-lea.ll
+++ b/llvm/test/CodeGen/X86/twoaddr-lea.ll
@@ -65,10 +65,10 @@ entry:
define void @ham() {
; CHECK-LABEL: ham:
; CHECK: ## %bb.0: ## %bb
+; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: movq _global@GOTPCREL(%rip), %rdx
; CHECK-NEXT: movq _global2@GOTPCREL(%rip), %rsi
-; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %cl, %cl
; CHECK-NEXT: je LBB3_2
; CHECK-NEXT: .p2align 4
diff --git a/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
new file mode 100644
index 0000000..6739be5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/umin-sub-to-usubo-select-combine.ll
@@ -0,0 +1,166 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64 | FileCheck %s
+
+; GitHub issue #161036
+
+; Positive test: umin(sub(a,b),a) with scalar types should be folded
+define i64 @underflow_compare_fold_i64(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: cmovaeq %rdi, %rax
+; CHECK-NEXT: retq
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+; Positive test: umin(a,sub(a,b)) with scalar types should be folded
+define i64 @underflow_compare_fold_i64_commute(i64 %a, i64 %b) {
+; CHECK-LABEL: underflow_compare_fold_i64_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: cmpq %rax, %rdi
+; CHECK-NEXT: cmovbq %rdi, %rax
+; CHECK-NEXT: retq
+ %sub = sub i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %a, i64 %sub)
+ ret i64 %cond
+}
+
+; Positive test: multi-use is OK since the sub instruction still runs once
+define i64 @underflow_compare_fold_i64_multi_use(i64 %a, i64 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i64_multi_use:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movq %rdi, %rax
+; CHECK-NEXT: subq %rsi, %rax
+; CHECK-NEXT: movq %rax, (%rdx)
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: cmovaeq %rdi, %rax
+; CHECK-NEXT: retq
+ %sub = sub i64 %a, %b
+ store i64 %sub, ptr addrspace(1) %ptr
+ %cond = call i64 @llvm.umin.i64(i64 %sub, i64 %a)
+ ret i64 %cond
+}
+
+; Positive test: i32
+define i32 @underflow_compare_fold_i32(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: cmpl %edi, %eax
+; CHECK-NEXT: cmovael %edi, %eax
+; CHECK-NEXT: retq
+ %sub = sub i32 %a, %b
+ %cond = tail call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+; Positive test: i32
+define i32 @underflow_compare_fold_i32_commute(i32 %a, i32 %b) {
+; CHECK-LABEL: underflow_compare_fold_i32_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: cmpl %eax, %edi
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: retq
+ %sub = sub i32 %a, %b
+ %cond = tail call i32 @llvm.umin.i32(i32 %a, i32 %sub)
+ ret i32 %cond
+}
+
+; Positive test: i32
+define i32 @underflow_compare_fold_i32_multi_use(i32 %a, i32 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i32_multi_use:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: movl %eax, (%rdx)
+; CHECK-NEXT: cmpl %edi, %eax
+; CHECK-NEXT: cmovael %edi, %eax
+; CHECK-NEXT: retq
+ %sub = sub i32 %a, %b
+ store i32 %sub, ptr addrspace(1) %ptr
+ %cond = call i32 @llvm.umin.i32(i32 %sub, i32 %a)
+ ret i32 %cond
+}
+
+; Positive test: i16
+define i16 @underflow_compare_fold_i16(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: cmpw %di, %ax
+; CHECK-NEXT: cmovael %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %sub = sub i16 %a, %b
+ %cond = tail call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+ ret i16 %cond
+}
+
+; Positive test: i16
+define i16 @underflow_compare_fold_i16_commute(i16 %a, i16 %b) {
+; CHECK-LABEL: underflow_compare_fold_i16_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: cmpw %ax, %di
+; CHECK-NEXT: cmovbl %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %sub = sub i16 %a, %b
+ %cond = tail call i16 @llvm.umin.i16(i16 %a, i16 %sub)
+ ret i16 %cond
+}
+
+; Positive test: i16
+define i16 @underflow_compare_fold_i16_multi_use(i16 %a, i16 %b, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: underflow_compare_fold_i16_multi_use:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: subl %esi, %eax
+; CHECK-NEXT: movw %ax, (%rdx)
+; CHECK-NEXT: cmpw %di, %ax
+; CHECK-NEXT: cmovael %edi, %eax
+; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT: retq
+ %sub = sub i16 %a, %b
+ store i16 %sub, ptr addrspace(1) %ptr
+ %cond = call i16 @llvm.umin.i16(i16 %sub, i16 %a)
+ ret i16 %cond
+}
+
+
+; Negative test, vector types: umin(sub(a,b),a) but with vectors
+define <16 x i8> @underflow_compare_dontfold_vectors(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: underflow_compare_dontfold_vectors:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: psubb %xmm1, %xmm2
+; CHECK-NEXT: pminub %xmm2, %xmm0
+; CHECK-NEXT: retq
+ %sub = sub <16 x i8> %a, %b
+ %cond = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %sub, <16 x i8> %a)
+ ret <16 x i8> %cond
+}
+
+; Negative test, pattern mismatch: umin(add(a,b),a)
+define i64 @umin_add(i64 %a, i64 %b) {
+; CHECK-LABEL: umin_add:
+; CHECK: # %bb.0:
+; CHECK-NEXT: leaq (%rsi,%rdi), %rax
+; CHECK-NEXT: cmpq %rdi, %rax
+; CHECK-NEXT: cmovaeq %rdi, %rax
+; CHECK-NEXT: retq
+ %add = add i64 %a, %b
+ %cond = tail call i64 @llvm.umin.i64(i64 %add, i64 %a)
+ ret i64 %cond
+}
diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index eacc714..5a68484 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -10,10 +10,10 @@ declare <4 x i32> @llvm.umul.fix.v4i32(<4 x i32>, <4 x i32>, i32)
define i32 @func(i32 %x, i32 %y) nounwind {
; X64-LABEL: func:
; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: movl %edi, %ecx
-; X64-NEXT: imulq %rax, %rcx
-; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: imulq %rcx, %rax
+; X64-NEXT: movl %eax, %ecx
; X64-NEXT: shrq $32, %rax
; X64-NEXT: shldl $30, %ecx, %eax
; X64-NEXT: # kill: def $eax killed $eax killed $rax
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index 759055d..1a92365 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -138,22 +138,25 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind {
; SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,u,819,u]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1024,2048,2048,2]
+; SSE2-NEXT: pmuludq %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE2-NEXT: movl $1463, %eax # imm = 0x5B7
; SSE2-NEXT: movd %eax, %xmm3
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [2048,u,2,u]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2047,2047,2047,2047]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3]
-; SSE2-NEXT: pslld $10, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3]
+; SSE2-NEXT: orps %xmm2, %xmm3
+; SSE2-NEXT: andps %xmm1, %xmm3
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
index 94c7892..3d0d73b 100644
--- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll
@@ -1,7 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_1:
@@ -110,16 +112,27 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; Don't fold if we can combine urem with udiv.
define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
-; SSE-LABEL: combine_urem_udiv:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
-; SSE-NEXT: pmulhuw %xmm0, %xmm1
-; SSE-NEXT: psrlw $6, %xmm1
-; SSE-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
-; SSE-NEXT: pmullw %xmm1, %xmm2
-; SSE-NEXT: psubw %xmm2, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_urem_udiv:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE2-NEXT: pmulhuw %xmm0, %xmm1
+; SSE2-NEXT: psrlw $6, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
+; SSE2-NEXT: pmullw %xmm1, %xmm2
+; SSE2-NEXT: psubw %xmm2, %xmm0
+; SSE2-NEXT: paddw %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: combine_urem_udiv:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
+; SSE4-NEXT: pmulhuw %xmm0, %xmm1
+; SSE4-NEXT: psrlw $6, %xmm1
+; SSE4-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
+; SSE4-NEXT: pmullw %xmm1, %xmm2
+; SSE4-NEXT: psubw %xmm2, %xmm0
+; SSE4-NEXT: paddw %xmm1, %xmm0
+; SSE4-NEXT: retq
;
; AVX-LABEL: combine_urem_udiv:
; AVX: # %bb.0:
@@ -137,24 +150,43 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; Don't fold for divisors that are a power of two.
define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
-; SSE-LABEL: dont_fold_urem_power_of_two:
-; SSE: # %bb.0:
-; SSE-NEXT: pmovsxbd {{.*#+}} xmm1 = [63,63,63,63]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: andl $31, %eax
-; SSE-NEXT: pinsrw $1, %eax, %xmm1
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: andl $7, %eax
-; SSE-NEXT: pinsrw $2, %eax, %xmm1
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
-; SSE-NEXT: shrl $22, %ecx
-; SSE-NEXT: imull $95, %ecx, %ecx
-; SSE-NEXT: subl %ecx, %eax
-; SSE-NEXT: pinsrw $3, %eax, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: dont_fold_urem_power_of_two:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pextrw $1, %xmm0, %eax
+; SSE2-NEXT: andl $31, %eax
+; SSE2-NEXT: pinsrw $1, %eax, %xmm1
+; SSE2-NEXT: pextrw $2, %xmm0, %eax
+; SSE2-NEXT: andl $7, %eax
+; SSE2-NEXT: pinsrw $2, %eax, %xmm1
+; SSE2-NEXT: pextrw $3, %xmm0, %eax
+; SSE2-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
+; SSE2-NEXT: shrl $22, %ecx
+; SSE2-NEXT: imull $95, %ecx, %ecx
+; SSE2-NEXT: subl %ecx, %eax
+; SSE2-NEXT: pinsrw $3, %eax, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: dont_fold_urem_power_of_two:
+; SSE4: # %bb.0:
+; SSE4-NEXT: pmovsxbd {{.*#+}} xmm1 = [63,63,63,63]
+; SSE4-NEXT: pand %xmm0, %xmm1
+; SSE4-NEXT: pextrw $1, %xmm0, %eax
+; SSE4-NEXT: andl $31, %eax
+; SSE4-NEXT: pinsrw $1, %eax, %xmm1
+; SSE4-NEXT: pextrw $2, %xmm0, %eax
+; SSE4-NEXT: andl $7, %eax
+; SSE4-NEXT: pinsrw $2, %eax, %xmm1
+; SSE4-NEXT: pextrw $3, %xmm0, %eax
+; SSE4-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
+; SSE4-NEXT: shrl $22, %ecx
+; SSE4-NEXT: imull $95, %ecx, %ecx
+; SSE4-NEXT: subl %ecx, %eax
+; SSE4-NEXT: pinsrw $3, %eax, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
+; SSE4-NEXT: retq
;
; AVX1-LABEL: dont_fold_urem_power_of_two:
; AVX1: # %bb.0:
@@ -190,6 +222,23 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; AVX2-NEXT: subl %ecx, %eax
; AVX2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: dont_fold_urem_power_of_two:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
+; AVX512-NEXT: vpextrw $1, %xmm0, %eax
+; AVX512-NEXT: andl $31, %eax
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX512-NEXT: vpextrw $2, %xmm0, %eax
+; AVX512-NEXT: andl $7, %eax
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512-NEXT: vpextrw $3, %xmm0, %eax
+; AVX512-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
+; AVX512-NEXT: shrl $22, %ecx
+; AVX512-NEXT: imull $95, %ecx, %ecx
+; AVX512-NEXT: subl %ecx, %eax
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX512-NEXT: retq
%1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
ret <4 x i16> %1
}
@@ -228,36 +277,67 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: dont_fold_urem_one:
-; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
-; AVX-NEXT: shrl $16, %ecx
-; AVX-NEXT: movl %eax, %edx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: movzwl %dx, %edx
-; AVX-NEXT: shrl %edx
-; AVX-NEXT: addl %ecx, %edx
-; AVX-NEXT: shrl $4, %edx
-; AVX-NEXT: leal (%rdx,%rdx,2), %ecx
-; AVX-NEXT: shll $3, %ecx
-; AVX-NEXT: subl %ecx, %edx
-; AVX-NEXT: addl %eax, %edx
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
-; AVX-NEXT: shrl $25, %ecx
-; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
-; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
-; AVX-NEXT: shrl $26, %ecx
-; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
-; AVX-NEXT: subl %ecx, %eax
-; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: dont_fold_urem_one:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpextrw $2, %xmm0, %eax
+; AVX1OR2-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
+; AVX1OR2-NEXT: shrl $16, %ecx
+; AVX1OR2-NEXT: movl %eax, %edx
+; AVX1OR2-NEXT: subl %ecx, %edx
+; AVX1OR2-NEXT: movzwl %dx, %edx
+; AVX1OR2-NEXT: shrl %edx
+; AVX1OR2-NEXT: addl %ecx, %edx
+; AVX1OR2-NEXT: shrl $4, %edx
+; AVX1OR2-NEXT: leal (%rdx,%rdx,2), %ecx
+; AVX1OR2-NEXT: shll $3, %ecx
+; AVX1OR2-NEXT: subl %ecx, %edx
+; AVX1OR2-NEXT: addl %eax, %edx
+; AVX1OR2-NEXT: vpextrw $1, %xmm0, %eax
+; AVX1OR2-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
+; AVX1OR2-NEXT: shrl $25, %ecx
+; AVX1OR2-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
+; AVX1OR2-NEXT: subl %ecx, %eax
+; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
+; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax
+; AVX1OR2-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
+; AVX1OR2-NEXT: shrl $26, %ecx
+; AVX1OR2-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
+; AVX1OR2-NEXT: subl %ecx, %eax
+; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: dont_fold_urem_one:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpextrw $2, %xmm0, %eax
+; AVX512-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
+; AVX512-NEXT: shrl $16, %ecx
+; AVX512-NEXT: movl %eax, %edx
+; AVX512-NEXT: subl %ecx, %edx
+; AVX512-NEXT: movzwl %dx, %edx
+; AVX512-NEXT: shrl %edx
+; AVX512-NEXT: addl %ecx, %edx
+; AVX512-NEXT: shrl $4, %edx
+; AVX512-NEXT: leal (%rdx,%rdx,2), %ecx
+; AVX512-NEXT: shll $3, %ecx
+; AVX512-NEXT: subl %ecx, %edx
+; AVX512-NEXT: vpextrw $1, %xmm0, %ecx
+; AVX512-NEXT: addl %eax, %edx
+; AVX512-NEXT: imull $51307, %ecx, %eax # imm = 0xC86B
+; AVX512-NEXT: shrl $25, %eax
+; AVX512-NEXT: imull $654, %eax, %eax # imm = 0x28E
+; AVX512-NEXT: subl %eax, %ecx
+; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
+; AVX512-NEXT: vpextrw $3, %xmm0, %eax
+; AVX512-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
+; AVX512-NEXT: shrl $26, %ecx
+; AVX512-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
+; AVX512-NEXT: subl %ecx, %eax
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
+; AVX512-NEXT: retq
%1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
ret <4 x i16> %1
}
@@ -267,49 +347,96 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; CHECK-LABEL: dont_fold_urem_i16_smax:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
+; SSE-LABEL: dont_fold_urem_i16_smax:
+; SSE: # %bb.0:
+; SSE-NEXT: retq
+;
+; AVX-LABEL: dont_fold_urem_i16_smax:
+; AVX: # %bb.0:
+; AVX-NEXT: retq
%1 = urem <4 x i16> %x, <i16 1, i16 65536, i16 23, i16 5423>
ret <4 x i16> %1
}
; Don't fold i64 urem.
define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
-; SSE-LABEL: dont_fold_urem_i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movq %xmm1, %rcx
-; SSE-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: mulq %rdx
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: subq %rdx, %rax
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: addq %rdx, %rax
-; SSE-NEXT: shrq $4, %rax
-; SSE-NEXT: leaq (%rax,%rax,2), %rdx
-; SSE-NEXT: shlq $3, %rdx
-; SSE-NEXT: subq %rdx, %rax
-; SSE-NEXT: addq %rcx, %rax
-; SSE-NEXT: movq %rax, %xmm2
-; SSE-NEXT: pextrq $1, %xmm1, %rcx
-; SSE-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: mulq %rdx
-; SSE-NEXT: shrq $12, %rdx
-; SSE-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
-; SSE-NEXT: subq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT: pextrq $1, %xmm0, %rcx
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: shrq %rax
-; SSE-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
-; SSE-NEXT: mulq %rdx
-; SSE-NEXT: shrq $7, %rdx
-; SSE-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
-; SSE-NEXT: subq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm0
-; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: dont_fold_urem_i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: movq %xmm1, %rcx
+; SSE2-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rdx
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: shrq %rax
+; SSE2-NEXT: addq %rdx, %rax
+; SSE2-NEXT: shrq $4, %rax
+; SSE2-NEXT: leaq (%rax,%rax,2), %rdx
+; SSE2-NEXT: shlq $3, %rdx
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: addq %rcx, %rax
+; SSE2-NEXT: movq %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE2-NEXT: movq %xmm2, %rcx
+; SSE2-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: mulq %rdx
+; SSE2-NEXT: shrq $12, %rdx
+; SSE2-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
+; SSE2-NEXT: subq %rax, %rcx
+; SSE2-NEXT: movq %rcx, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT: movq %xmm0, %rcx
+; SSE2-NEXT: movq %rcx, %rax
+; SSE2-NEXT: shrq %rax
+; SSE2-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
+; SSE2-NEXT: mulq %rdx
+; SSE2-NEXT: shrq $7, %rdx
+; SSE2-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
+; SSE2-NEXT: subq %rax, %rcx
+; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: dont_fold_urem_i64:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movq %xmm1, %rcx
+; SSE4-NEXT: movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
+; SSE4-NEXT: movq %rcx, %rax
+; SSE4-NEXT: mulq %rdx
+; SSE4-NEXT: movq %rcx, %rax
+; SSE4-NEXT: subq %rdx, %rax
+; SSE4-NEXT: shrq %rax
+; SSE4-NEXT: addq %rdx, %rax
+; SSE4-NEXT: shrq $4, %rax
+; SSE4-NEXT: leaq (%rax,%rax,2), %rdx
+; SSE4-NEXT: shlq $3, %rdx
+; SSE4-NEXT: subq %rdx, %rax
+; SSE4-NEXT: addq %rcx, %rax
+; SSE4-NEXT: movq %rax, %xmm2
+; SSE4-NEXT: pextrq $1, %xmm1, %rcx
+; SSE4-NEXT: movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
+; SSE4-NEXT: movq %rcx, %rax
+; SSE4-NEXT: mulq %rdx
+; SSE4-NEXT: shrq $12, %rdx
+; SSE4-NEXT: imulq $5423, %rdx, %rax # imm = 0x152F
+; SSE4-NEXT: subq %rax, %rcx
+; SSE4-NEXT: movq %rcx, %xmm1
+; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE4-NEXT: pextrq $1, %xmm0, %rcx
+; SSE4-NEXT: movq %rcx, %rax
+; SSE4-NEXT: shrq %rax
+; SSE4-NEXT: movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
+; SSE4-NEXT: mulq %rdx
+; SSE4-NEXT: shrq $7, %rdx
+; SSE4-NEXT: imulq $654, %rdx, %rax # imm = 0x28E
+; SSE4-NEXT: subq %rax, %rcx
+; SSE4-NEXT: movq %rcx, %xmm0
+; SSE4-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE4-NEXT: movdqa %xmm2, %xmm1
+; SSE4-NEXT: retq
;
; AVX1-LABEL: dont_fold_urem_i64:
; AVX1: # %bb.0:
@@ -388,6 +515,43 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
+;
+; AVX512-LABEL: dont_fold_urem_i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovq %xmm1, %rdx
+; AVX512-NEXT: movabsq $7218291159277650633, %rax # imm = 0x642C8590B21642C9
+; AVX512-NEXT: mulxq %rax, %rax, %rax
+; AVX512-NEXT: movq %rdx, %rcx
+; AVX512-NEXT: subq %rax, %rcx
+; AVX512-NEXT: shrq %rcx
+; AVX512-NEXT: addq %rax, %rcx
+; AVX512-NEXT: shrq $4, %rcx
+; AVX512-NEXT: leaq (%rcx,%rcx,2), %rax
+; AVX512-NEXT: shlq $3, %rax
+; AVX512-NEXT: subq %rax, %rcx
+; AVX512-NEXT: addq %rdx, %rcx
+; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT: movabsq $-4513890722074972339, %rax # imm = 0xC15B704DCBCA2F4D
+; AVX512-NEXT: mulxq %rax, %rax, %rax
+; AVX512-NEXT: vmovq %rcx, %xmm1
+; AVX512-NEXT: shrq $12, %rax
+; AVX512-NEXT: imulq $5423, %rax, %rax # imm = 0x152F
+; AVX512-NEXT: subq %rax, %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm2
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: movq %rax, %rdx
+; AVX512-NEXT: shrq %rdx
+; AVX512-NEXT: movabsq $7220743857598845893, %rcx # imm = 0x64353C48064353C5
+; AVX512-NEXT: mulxq %rcx, %rcx, %rcx
+; AVX512-NEXT: shrq $7, %rcx
+; AVX512-NEXT: imulq $654, %rcx, %rcx # imm = 0x28E
+; AVX512-NEXT: subq %rcx, %rax
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
%1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
ret <4 x i64> %1
}
diff --git a/llvm/test/CodeGen/X86/ushl_sat.ll b/llvm/test/CodeGen/X86/ushl_sat.ll
index e0e1ef7..9768e47 100644
--- a/llvm/test/CodeGen/X86/ushl_sat.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat.ll
@@ -14,23 +14,23 @@ define i16 @func(i16 %x, i16 %y) nounwind {
; X64-LABEL: func:
; X64: # %bb.0:
; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: movl %edi, %edx
-; X64-NEXT: shll %cl, %edx
-; X64-NEXT: movzwl %dx, %eax
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: shll %cl, %edi
+; X64-NEXT: movzwl %di, %edx
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-NEXT: shrl %cl, %eax
-; X64-NEXT: cmpw %ax, %di
+; X64-NEXT: shrl %cl, %edx
+; X64-NEXT: cmpw %dx, %ax
; X64-NEXT: movl $65535, %eax # imm = 0xFFFF
-; X64-NEXT: cmovel %edx, %eax
+; X64-NEXT: cmovel %edi, %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
;
; X86-LABEL: func:
; X86: # %bb.0:
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %eax, %edx
; X86-NEXT: shll %cl, %edx
; X86-NEXT: movzwl %dx, %esi
; X86-NEXT: shrl %cl, %esi
@@ -51,14 +51,14 @@ define i16 @func2(i8 %x, i8 %y) nounwind {
; X64-NEXT: movsbl %dil, %eax
; X64-NEXT: addl %eax, %eax
; X64-NEXT: movl %eax, %edx
-; X64-NEXT: shll %cl, %edx
-; X64-NEXT: movzwl %dx, %esi
+; X64-NEXT: shll %cl, %eax
+; X64-NEXT: movzwl %ax, %esi
; X64-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NEXT: shrl %cl, %esi
-; X64-NEXT: cmpw %si, %ax
-; X64-NEXT: movl $65535, %eax # imm = 0xFFFF
-; X64-NEXT: cmovel %edx, %eax
-; X64-NEXT: cwtl
+; X64-NEXT: cmpw %si, %dx
+; X64-NEXT: movl $65535, %ecx # imm = 0xFFFF
+; X64-NEXT: cmovel %eax, %ecx
+; X64-NEXT: movswl %cx, %eax
; X64-NEXT: shrl %eax
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/ushl_sat_vec.ll b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
index b8e83da..762088c 100644
--- a/llvm/test/CodeGen/X86/ushl_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ushl_sat_vec.ll
@@ -300,95 +300,94 @@ define <8 x i16> @vec_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $12, %esp
+; X86-NEXT: subl $16, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ebp, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: movzwl %bx, %edi
-; X86-NEXT: shrl %cl, %edi
-; X86-NEXT: cmpw %di, %ax
-; X86-NEXT: movl $65535, %eax # imm = 0xFFFF
-; X86-NEXT: cmovnel %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl %edx, %ecx
; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movzwl %ax, %edi
-; X86-NEXT: shrl %cl, %edi
-; X86-NEXT: cmpw %di, %si
+; X86-NEXT: movzwl %ax, %esi
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: cmpw %si, %dx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $65535, %esi # imm = 0xFFFF
-; X86-NEXT: cmovnel %esi, %eax
+; X86-NEXT: movl $65535, %edx # imm = 0xFFFF
+; X86-NEXT: cmovnel %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movzwl %ax, %edx
-; X86-NEXT: shrl %cl, %edx
-; X86-NEXT: cmpw %dx, %bp
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmovnel %esi, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movl $65535, %eax # imm = 0xFFFF
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %ebp
; X86-NEXT: shll %cl, %ebp
-; X86-NEXT: movzwl %bp, %edx
-; X86-NEXT: shrl %cl, %edx
-; X86-NEXT: cmpw %dx, %si
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzwl %bp, %eax
+; X86-NEXT: shrl %cl, %eax
+; X86-NEXT: cmpw %ax, %di
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmovnel %eax, %ebp
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: cmovnel %edx, %ebp
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: movzwl %bx, %esi
-; X86-NEXT: shrl %cl, %esi
-; X86-NEXT: cmpw %si, %dx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzwl %bx, %edx
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: cmpw %dx, %ax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl $65535, %esi # imm = 0xFFFF
; X86-NEXT: cmovnel %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movzwl %di, %edx
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: cmpw %dx, %ax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: cmovnel %esi, %edi
+; X86-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: shll %cl, %ebp
+; X86-NEXT: movzwl %bp, %edx
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: cmpw %dx, %ax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmovnel %esi, %ebp
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movzwl %di, %eax
-; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: cmpw %ax, %dx
+; X86-NEXT: movzwl %di, %edx
+; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: cmpw %dx, %ax
; X86-NEXT: cmovnel %esi, %edi
+; X86-NEXT: movl $65535, %ebx # imm = 0xFFFF
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movzwl %si, %eax
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movzwl %dx, %eax
; X86-NEXT: shrl %cl, %eax
-; X86-NEXT: cmpw %ax, %dx
-; X86-NEXT: movl $65535, %eax # imm = 0xFFFF
-; X86-NEXT: cmovnel %eax, %esi
+; X86-NEXT: cmpw %ax, %si
+; X86-NEXT: cmovnel %ebx, %edx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movzwl %ax, %edx
-; X86-NEXT: shrl %cl, %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmpw %dx, %cx
+; X86-NEXT: movzwl %ax, %esi
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: cmpw %si, %bx
; X86-NEXT: movl $65535, %ecx # imm = 0xFFFF
; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movw %ax, 14(%ecx)
-; X86-NEXT: movw %si, 12(%ecx)
+; X86-NEXT: movw %dx, 12(%ecx)
; X86-NEXT: movw %di, 10(%ecx)
-; X86-NEXT: movw %bx, 8(%ecx)
-; X86-NEXT: movw %bp, 6(%ecx)
+; X86-NEXT: movw %bp, 8(%ecx)
; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movw %ax, 6(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movw %ax, 4(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movw %ax, 2(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movw %ax, (%ecx)
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: addl $12, %esp
+; X86-NEXT: addl $16, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/vector-compress-freeze.ll b/llvm/test/CodeGen/X86/vector-compress-freeze.ll
new file mode 100644
index 0000000..981557f
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vector-compress-freeze.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl | FileCheck %s
+
+declare <16 x i32> @llvm.experimental.vector.compress.v16i32(<16 x i32>, <16 x i1>, <16 x i32>)
+
+define <16 x i32> @test_compress_freeze_elimination(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a3) {
+; CHECK-LABEL: test_compress_freeze_elimination:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; CHECK-NEXT: vpcompressd %zmm1, %zmm0 {%k1}
+; CHECK-NEXT: retq
+ %cmp = icmp sgt <16 x i32> %a0, %a1
+ %ext = zext <16 x i8> %a3 to <16 x i32>
+ %cpr = call <16 x i32> @llvm.experimental.vector.compress.v16i32(<16 x i32> %ext, <16 x i1> %cmp, <16 x i32> splat(i32 15))
+ %fr = freeze <16 x i32> %cpr
+ %and = and <16 x i32> %fr, splat(i32 255)
+ ret <16 x i32> %and
+}
+
+define <16 x i32> @test_compress_freeze(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a3) {
+; CHECK-LABEL: test_compress_freeze:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; CHECK-NEXT: vpcompressd %zmm0, %zmm0 {%k1}
+; CHECK-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %cmp = icmp sgt <16 x i32> %a0, %a1
+ %ext = zext <16 x i8> %a3 to <16 x i32>
+ %cpr = call <16 x i32> @llvm.experimental.vector.compress.v16i32(<16 x i32> %ext, <16 x i1> %cmp, <16 x i32> poison)
+ %fr = freeze <16 x i32> %cpr
+ %and = and <16 x i32> %fr, splat(i32 255)
+ ret <16 x i32> %and
+}
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 4a5b427..88d3ad1 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -4143,11 +4143,11 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; CHECK-NEXT: movd %edx, %xmm1
+; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
@@ -4155,10 +4155,10 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f32() #0 {
; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -4256,11 +4256,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 {
; AVX1-LABEL: constrained_vector_fptosi_v3i64_v3f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm0
+; AVX1-NEXT: vmovq %rcx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -4268,11 +4268,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f32() #0 {
; AVX512-LABEL: constrained_vector_fptosi_v3i64_v3f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm0
+; AVX512-NEXT: vmovq %rcx, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -4382,11 +4382,11 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fptosi_v3i32_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; CHECK-NEXT: movd %edx, %xmm1
+; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
@@ -4394,10 +4394,10 @@ define <3 x i32> @constrained_vector_fptosi_v3i32_v3f64() #0 {
; AVX-LABEL: constrained_vector_fptosi_v3i32_v3f64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; AVX-NEXT: vmovd %edx, %xmm0
+; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
@@ -4498,11 +4498,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 {
; AVX1-LABEL: constrained_vector_fptosi_v3i64_v3f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm0
+; AVX1-NEXT: vmovq %rcx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -4510,11 +4510,11 @@ define <3 x i64> @constrained_vector_fptosi_v3i64_v3f64() #0 {
; AVX512-LABEL: constrained_vector_fptosi_v3i64_v3f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm0
+; AVX512-NEXT: vmovq %rcx, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -4645,11 +4645,11 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; CHECK-NEXT: movd %edx, %xmm1
+; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT: cvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
@@ -4658,19 +4658,19 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f32() #0 {
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
+; AVX1-NEXT: vcvttss2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vmovd %eax, %xmm0
-; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; AVX512-NEXT: vmovd %edx, %xmm0
+; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
@@ -4911,7 +4911,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 {
;
; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
; AVX1-NEXT: vcomiss %xmm2, %xmm0
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
@@ -4921,51 +4921,51 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 {
; AVX1-NEXT: vmovaps %xmm0, %xmm3
; AVX1-NEXT: .LBB123_2: # %entry
; AVX1-NEXT: vsubss %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vcvttss2si %xmm2, %rax
-; AVX1-NEXT: setbe %cl
-; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: shlq $63, %rcx
-; AVX1-NEXT: xorq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
-; AVX1-NEXT: vmovss {{.*#+}} xmm3 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
-; AVX1-NEXT: vcomiss %xmm3, %xmm0
-; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcvttss2si %xmm2, %rcx
+; AVX1-NEXT: setbe %al
+; AVX1-NEXT: movzbl %al, %eax
+; AVX1-NEXT: shlq $63, %rax
+; AVX1-NEXT: xorq %rcx, %rax
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX1-NEXT: vcomiss %xmm2, %xmm0
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
; AVX1-NEXT: ja .LBB123_4
; AVX1-NEXT: # %bb.3: # %entry
-; AVX1-NEXT: vmovaps %xmm0, %xmm4
+; AVX1-NEXT: vmovaps %xmm0, %xmm3
; AVX1-NEXT: .LBB123_4: # %entry
-; AVX1-NEXT: vsubss %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vcvttss2si %xmm3, %rax
+; AVX1-NEXT: vsubss %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vcvttss2si %xmm2, %rdx
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
-; AVX1-NEXT: xorq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vmovss {{.*#+}} xmm3 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
-; AVX1-NEXT: vcomiss %xmm3, %xmm0
+; AVX1-NEXT: xorq %rdx, %rcx
+; AVX1-NEXT: vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX1-NEXT: vcomiss %xmm2, %xmm0
; AVX1-NEXT: ja .LBB123_6
; AVX1-NEXT: # %bb.5: # %entry
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: .LBB123_6: # %entry
-; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm0
-; AVX1-NEXT: vcvttss2si %xmm0, %rax
-; AVX1-NEXT: setbe %cl
-; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: shlq $63, %rcx
-; AVX1-NEXT: xorq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vcvttss2si %xmm0, %rdx
+; AVX1-NEXT: setbe %sil
+; AVX1-NEXT: movzbl %sil, %esi
+; AVX1-NEXT: shlq $63, %rsi
+; AVX1-NEXT: xorq %rdx, %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm0
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm0
+; AVX512-NEXT: vmovq %rcx, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vcvttss2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -5194,11 +5194,11 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fptoui_v3i32_v3f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; CHECK-NEXT: movd %eax, %xmm1
-; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; CHECK-NEXT: movd %edx, %xmm1
+; CHECK-NEXT: movd %ecx, %xmm0
; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT: cvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
; CHECK-NEXT: movd %eax, %xmm1
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
@@ -5207,19 +5207,19 @@ define <3 x i32> @constrained_vector_fptoui_v3i32_v3f64() #0 {
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
+; AVX1-NEXT: vcvttsd2si {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX1-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v3i32_v3f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vmovd %eax, %xmm0
-; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
+; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ecx
+; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %edx
+; AVX512-NEXT: vmovd %edx, %xmm0
+; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX512-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
; AVX512-NEXT: retq
entry:
@@ -5466,7 +5466,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 {
;
; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0]
+; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = [4.2299999999999997E+1,0.0E+0]
; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [9.2233720368547758E+18,0.0E+0]
; AVX1-NEXT: vcomisd %xmm2, %xmm0
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
@@ -5476,51 +5476,51 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 {
; AVX1-NEXT: vmovapd %xmm0, %xmm3
; AVX1-NEXT: .LBB131_2: # %entry
; AVX1-NEXT: vsubsd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vcvttsd2si %xmm2, %rax
-; AVX1-NEXT: setbe %cl
-; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: shlq $63, %rcx
-; AVX1-NEXT: xorq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
-; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = [4.2100000000000001E+1,0.0E+0]
-; AVX1-NEXT: vcomisd %xmm3, %xmm0
-; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcvttsd2si %xmm2, %rcx
+; AVX1-NEXT: setbe %al
+; AVX1-NEXT: movzbl %al, %eax
+; AVX1-NEXT: shlq $63, %rax
+; AVX1-NEXT: xorq %rcx, %rax
+; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = [4.2100000000000001E+1,0.0E+0]
+; AVX1-NEXT: vcomisd %xmm2, %xmm0
+; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; AVX1-NEXT: ja .LBB131_4
; AVX1-NEXT: # %bb.3: # %entry
-; AVX1-NEXT: vmovapd %xmm0, %xmm4
+; AVX1-NEXT: vmovapd %xmm0, %xmm3
; AVX1-NEXT: .LBB131_4: # %entry
-; AVX1-NEXT: vsubsd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vcvttsd2si %xmm3, %rax
+; AVX1-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vcvttsd2si %xmm2, %rdx
; AVX1-NEXT: setbe %cl
; AVX1-NEXT: movzbl %cl, %ecx
; AVX1-NEXT: shlq $63, %rcx
-; AVX1-NEXT: xorq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = [4.2299999999999997E+1,0.0E+0]
-; AVX1-NEXT: vcomisd %xmm3, %xmm0
+; AVX1-NEXT: xorq %rdx, %rcx
+; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0]
+; AVX1-NEXT: vcomisd %xmm2, %xmm0
; AVX1-NEXT: ja .LBB131_6
; AVX1-NEXT: # %bb.5: # %entry
; AVX1-NEXT: vmovapd %xmm0, %xmm1
; AVX1-NEXT: .LBB131_6: # %entry
-; AVX1-NEXT: vsubsd %xmm1, %xmm3, %xmm0
-; AVX1-NEXT: vcvttsd2si %xmm0, %rax
-; AVX1-NEXT: setbe %cl
-; AVX1-NEXT: movzbl %cl, %ecx
-; AVX1-NEXT: shlq $63, %rcx
-; AVX1-NEXT: xorq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm0
+; AVX1-NEXT: vcvttsd2si %xmm0, %rdx
+; AVX1-NEXT: setbe %sil
+; AVX1-NEXT: movzbl %sil, %esi
+; AVX1-NEXT: shlq $63, %rsi
+; AVX1-NEXT: xorq %rdx, %rsi
+; AVX1-NEXT: vmovq %rsi, %xmm0
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovq %rax, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_fptoui_v3i64_v3f64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT: vmovq %rax, %xmm0
-; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
-; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rcx
+; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rdx
+; AVX512-NEXT: vmovq %rdx, %xmm0
+; AVX512-NEXT: vmovq %rcx, %xmm1
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vcvttsd2usi {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %rax
; AVX512-NEXT: vmovq %rax, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
@@ -5731,26 +5731,26 @@ entry:
define <3 x float> @constrained_vector_fptrunc_v3f64() #0 {
; CHECK-LABEL: constrained_vector_fptrunc_v3f64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1
; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0
-; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.2299999999999997E+1,0.0E+0]
-; CHECK-NEXT: cvtsd2ss %xmm1, %xmm1
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0]
+; CHECK-NEXT: cvtsd2ss %xmm2, %xmm2
+; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_fptrunc_v3f64:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2100000000000001E+1,0.0E+0]
; AVX-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2299999999999997E+1,0.0E+0]
-; AVX-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; AVX-NEXT: vmovsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0]
+; AVX-NEXT: vcvtsd2ss %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT: retq
entry:
%result = call <3 x float> @llvm.experimental.constrained.fptrunc.v3f32.v3f64(
@@ -5834,14 +5834,14 @@ define <3 x double> @constrained_vector_fpext_v3f32() #0 {
;
; AVX-LABEL: constrained_vector_fpext_v3f32:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
-; AVX-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vcvtss2sd %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: retq
entry:
%result = call <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32(
@@ -6702,14 +6702,14 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 {
;
; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i32:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vextractps $1, %xmm0, %eax
+; AVX-NEXT: vextractps $2, %xmm0, %eax
; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm1
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm2
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
%result = call <3 x double>
@@ -6722,31 +6722,31 @@ entry:
define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT: movd %xmm1, %eax
+; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2ss %eax, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; CHECK-NEXT: movd %xmm2, %eax
; CHECK-NEXT: xorps %xmm2, %xmm2
; CHECK-NEXT: cvtsi2ss %eax, %xmm2
-; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %eax, %xmm0
-; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX-LABEL: constrained_vector_sitofp_v3f32_v3i32:
; AVX: # %bb.0: # %entry
-; AVX-NEXT: vextractps $1, %xmm0, %eax
+; AVX-NEXT: vextractps $2, %xmm0, %eax
; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm1
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm2
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX-NEXT: vpextrd $2, %xmm0, %eax
+; AVX-NEXT: vpextrd $1, %xmm0, %eax
; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: retq
entry:
%result = call <3 x float>
@@ -6769,28 +6769,28 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 {
;
; AVX1-LABEL: constrained_vector_sitofp_v3f64_v3i64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_sitofp_v3f64_v3i64:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovq %xmm1, %rax
; AVX512-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2
-; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
entry:
%result = call <3 x double>
@@ -6803,39 +6803,38 @@ entry:
define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_sitofp_v3f32_v3i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cvtsi2ss %rsi, %xmm1
-; CHECK-NEXT: cvtsi2ss %rdi, %xmm0
-; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2ss %rdx, %xmm1
+; CHECK-NEXT: cvtsi2ss %rdi, %xmm0
+; CHECK-NEXT: cvtsi2ss %rsi, %xmm2
+; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_sitofp_v3f32_v3i64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_sitofp_v3f32_v3i64:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovq %xmm1, %rax
; AVX512-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
@@ -7415,26 +7414,26 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 {
;
; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractps $1, %xmm0, %eax
+; AVX1-NEXT: vextractps $2, %xmm0, %eax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vextractps $1, %xmm0, %eax
+; AVX512-NEXT: vextractps $2, %xmm0, %eax
; AVX512-NEXT: vcvtusi2sd %eax, %xmm15, %xmm1
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vcvtusi2sd %eax, %xmm15, %xmm2
-; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vpextrd $2, %xmm0, %eax
+; AVX512-NEXT: vpextrd $1, %xmm0, %eax
; AVX512-NEXT: vcvtusi2sd %eax, %xmm15, %xmm0
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
entry:
%result = call <3 x double>
@@ -7447,43 +7446,43 @@ entry:
define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i32:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movd %xmm0, %eax
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; CHECK-NEXT: movd %xmm1, %eax
+; CHECK-NEXT: xorps %xmm1, %xmm1
; CHECK-NEXT: cvtsi2ss %rax, %xmm1
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; CHECK-NEXT: movd %xmm2, %eax
; CHECK-NEXT: xorps %xmm2, %xmm2
; CHECK-NEXT: cvtsi2ss %rax, %xmm2
-; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; CHECK-NEXT: movd %xmm0, %eax
; CHECK-NEXT: xorps %xmm0, %xmm0
; CHECK-NEXT: cvtsi2ss %rax, %xmm0
-; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; CHECK-NEXT: movaps %xmm1, %xmm0
+; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i32:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractps $1, %xmm0, %eax
+; AVX1-NEXT: vextractps $2, %xmm0, %eax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT: vpextrd $2, %xmm0, %eax
+; AVX1-NEXT: vpextrd $1, %xmm0, %eax
; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vextractps $1, %xmm0, %eax
+; AVX512-NEXT: vextractps $2, %xmm0, %eax
; AVX512-NEXT: vcvtusi2ss %eax, %xmm15, %xmm1
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: vcvtusi2ss %eax, %xmm15, %xmm2
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512-NEXT: vpextrd $2, %xmm0, %eax
+; AVX512-NEXT: vpextrd $1, %xmm0, %eax
; AVX512-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512-NEXT: retq
entry:
%result = call <3 x float>
@@ -7539,7 +7538,8 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
;
; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
@@ -7565,9 +7565,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
; AVX1-NEXT: # %bb.3:
; AVX1-NEXT: vaddsd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB183_4: # %entry
-; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
@@ -7580,20 +7578,21 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 {
; AVX1-NEXT: # %bb.5:
; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: .LBB183_6: # %entry
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i64:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovq %xmm1, %rax
; AVX512-NEXT: vcvtusi2sd %rax, %xmm15, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2
-; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0
-; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm0[0]
+; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
entry:
%result = call <3 x double>
@@ -7606,13 +7605,13 @@ entry:
define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
; CHECK-LABEL: constrained_vector_uitofp_v3f32_v3i64:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rax
+; CHECK-NEXT: movq %rdx, %rax
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: orq %rax, %rcx
-; CHECK-NEXT: testq %rsi, %rsi
-; CHECK-NEXT: cmovnsq %rsi, %rcx
+; CHECK-NEXT: testq %rdx, %rdx
+; CHECK-NEXT: cmovnsq %rdx, %rcx
; CHECK-NEXT: cvtsi2ss %rcx, %xmm1
; CHECK-NEXT: jns .LBB184_2
; CHECK-NEXT: # %bb.1:
@@ -7630,26 +7629,26 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: addss %xmm0, %xmm0
; CHECK-NEXT: .LBB184_4: # %entry
-; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT: movq %rdx, %rax
+; CHECK-NEXT: movq %rsi, %rax
; CHECK-NEXT: shrq %rax
-; CHECK-NEXT: movl %edx, %ecx
+; CHECK-NEXT: movl %esi, %ecx
; CHECK-NEXT: andl $1, %ecx
; CHECK-NEXT: orq %rax, %rcx
-; CHECK-NEXT: testq %rdx, %rdx
-; CHECK-NEXT: cmovnsq %rdx, %rcx
-; CHECK-NEXT: xorps %xmm1, %xmm1
-; CHECK-NEXT: cvtsi2ss %rcx, %xmm1
+; CHECK-NEXT: testq %rsi, %rsi
+; CHECK-NEXT: cmovnsq %rsi, %rcx
+; CHECK-NEXT: cvtsi2ss %rcx, %xmm2
; CHECK-NEXT: jns .LBB184_6
; CHECK-NEXT: # %bb.5:
-; CHECK-NEXT: addss %xmm1, %xmm1
+; CHECK-NEXT: addss %xmm2, %xmm2
; CHECK-NEXT: .LBB184_6: # %entry
+; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; CHECK-NEXT: retq
;
; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i64:
; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovq %xmm1, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
@@ -7675,9 +7674,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
; AVX1-NEXT: # %bb.3:
; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
; AVX1-NEXT: .LBB184_4: # %entry
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq %rcx
; AVX1-NEXT: movl %eax, %edx
@@ -7690,21 +7687,22 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 {
; AVX1-NEXT: # %bb.5:
; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
; AVX1-NEXT: .LBB184_6: # %entry
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i64:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovq %xmm1, %rax
; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2
-; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vpextrq $1, %xmm0, %rax
; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0
-; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
+; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
index 304daab..2e85a4e 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll
@@ -319,9 +319,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,u,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,u,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -333,8 +333,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,u,1,u]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,32,u,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,u,u]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -345,8 +345,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,u,1,u]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,32,u,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,u,u]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -411,9 +411,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v2i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,u,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,u,1,u]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [32,32,u,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
index ae5dd18..8db5414 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll
@@ -499,11 +499,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-NEXT: psrld $28, %xmm1
; SSE2-NEXT: psrld $27, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,u,1,u]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE2-NEXT: pslld $4, %xmm0
+; SSE2-NEXT: pslld $5, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -514,7 +512,10 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE41-NEXT: psrld $27, %xmm2
; SSE41-NEXT: psrld $28, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16,32,1,1]
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: pslld $5, %xmm1
+; SSE41-NEXT: pslld $4, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
@@ -523,7 +524,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; AVX1-NEXT: vpsrld $27, %xmm1, %xmm2
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16,32,1,1]
+; AVX1-NEXT: vpslld $5, %xmm0, %xmm2
+; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -597,11 +600,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; X86-SSE2-NEXT: psrld $28, %xmm1
; X86-SSE2-NEXT: psrld $27, %xmm2
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16,32,1,1]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [32,u,1,u]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: pslld $4, %xmm0
+; X86-SSE2-NEXT: pslld $5, %xmm2
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index 33a6a76..30205259 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1989,11 +1989,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE2-NEXT: paddb %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,1,2,4,8,16,32,64]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,u,1,u,2,u,4,u,8,u,16,u,32,u,64,u]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,64,32,16,8,4,2,1]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,u,64,u,32,u,16,u,8,u,4,u,2,u,1,u]
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
@@ -2014,7 +2014,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; SSE41-NEXT: psllw $8, %xmm1
; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -2033,7 +2033,7 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -2149,11 +2149,11 @@ define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; X86-SSE2-NEXT: paddb %xmm0, %xmm0
; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [128,1,2,4,8,16,32,64]
+; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [128,u,1,u,2,u,4,u,8,u,16,u,32,u,64,u]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [128,64,32,16,8,4,2,1]
+; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [128,u,64,u,32,u,16,u,8,u,4,u,2,u,1,u]
; X86-SSE2-NEXT: pand %xmm3, %xmm0
; X86-SSE2-NEXT: packuswb %xmm2, %xmm0
; X86-SSE2-NEXT: por %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 217431be..0cffa1b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1631,9 +1631,9 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX1-NEXT: vpsllw $8, %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = [128,32,8,2,128,2,8,32]
-; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
+; AVX1-NEXT: vpmullw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1653,7 +1653,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX2-NEXT: vpsllw $8, %ymm2, %ymm2
; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -1672,7 +1672,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX512F-NEXT: vpsllw $8, %ymm2, %ymm2
; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -1690,7 +1690,7 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind {
; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64,0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
; AVX512VL-NEXT: vpsllw $8, %ymm2, %ymm2
-; AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | ymm1 | ymm2
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index 3a522cc..25f8f94 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -915,10 +915,10 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
; AVX512F-NEXT: vpsllw $8, %ymm3, %ymm3
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512F-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2
+; AVX512F-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512F-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3
@@ -957,10 +957,10 @@ define <64 x i8> @constant_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y) nounwind {
; AVX512VL-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
; AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0,128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
+; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64,128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
; AVX512VL-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmaddubsw %ymm4, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmullw %ymm4, %ymm2, %ymm2
; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm3
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
index 4b42b18..17bbfa1 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll
@@ -341,9 +341,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE2-LABEL: constant_funnnel_v2i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,u,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u]
+; SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,u,u]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -355,8 +355,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; SSE41-LABEL: constant_funnnel_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,u,1,u]
-; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,1,1]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [134217728,134217728,u,u]
+; SSE41-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [268435456,134217728,u,u]
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -367,8 +367,8 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; AVX1-LABEL: constant_funnnel_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,u,1,u]
-; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,1,1]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [134217728,134217728,u,u]
+; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [268435456,134217728,u,u]
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
@@ -433,9 +433,9 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind {
; X86-SSE2-LABEL: constant_funnnel_v2i32:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,1,1]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [268435456,134217728,u,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
-; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,u,1,u]
+; X86-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [134217728,134217728,u,u]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
index 2d8670a..144e77b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll
@@ -497,42 +497,35 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %
define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
; SSE2-LABEL: constant_funnnel_v2i32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: psrld $5, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: psrld $4, %xmm3
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; SSE2-NEXT: pslld $28, %xmm0
-; SSE2-NEXT: pslld $27, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: pslld $27, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_funnnel_v2i32:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psrld $5, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrld $4, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
+; SSE41-NEXT: psrld $4, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pslld $27, %xmm1
; SSE41-NEXT: pslld $28, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: por %xmm3, %xmm0
+; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_funnnel_v2i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrld $5, %xmm1, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpsrld $4, %xmm1, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
+; AVX1-NEXT: vpsrld $4, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpslld $27, %xmm0, %xmm2
; AVX1-NEXT: vpslld $28, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
@@ -606,17 +599,15 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
;
; X86-SSE2-LABEL: constant_funnnel_v2i32:
; X86-SSE2: # %bb.0:
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm2
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; X86-SSE2-NEXT: psrld $4, %xmm1
; X86-SSE2-NEXT: psrld $5, %xmm2
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm3
-; X86-SSE2-NEXT: psrld $4, %xmm3
-; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[2,3]
-; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
; X86-SSE2-NEXT: pslld $28, %xmm0
-; X86-SSE2-NEXT: pslld $27, %xmm1
-; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE2-NEXT: por %xmm3, %xmm0
+; X86-SSE2-NEXT: pslld $27, %xmm2
+; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X86-SSE2-NEXT: por %xmm1, %xmm0
; X86-SSE2-NEXT: retl
%res = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 4, i32 5>)
ret <2 x i32> %res
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index e68d1d7..3117865 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -691,11 +691,11 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: psubb %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [14,13,12,11,10,9,9,7]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [14,u,13,u,12,u,11,u,10,u,9,u,9,u,7,u]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [7,8,9,10,11,12,13,14]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [7,u,8,u,9,u,10,u,11,u,12,u,13,u,14,u]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: psubb %xmm1, %xmm0
@@ -731,7 +731,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: por %xmm1, %xmm2
; SSE41-NEXT: psubb %xmm2, %xmm0
@@ -762,7 +762,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index 7355f36..fa5692a 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -660,7 +660,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm5
; AVX1-NEXT: vpsubb %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm5 # [22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -686,7 +686,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
@@ -720,7 +720,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index 5445330..b11756a 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -544,7 +544,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm5
; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm5 # [38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -570,7 +570,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsubb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38]
; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -603,7 +603,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmovb2m %zmm1, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm2
; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 6cd5098..cbc2b96 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -787,13 +787,13 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [64,64,32,32,32,128,128,64]
; SSE2-NEXT: psrlw $8, %xmm3
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [14,13,12,11,10,9,9,7]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [14,u,13,u,12,u,11,u,10,u,9,u,9,u,7,u]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,256,128,32,32,32,64,64]
; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,u,8,u,9,u,10,u,11,u,12,u,13,u,14,u]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: psubb %xmm2, %xmm0
@@ -840,7 +840,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT: por %xmm1, %xmm2
; SSE41-NEXT: psubb %xmm2, %xmm0
@@ -882,7 +882,7 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [0,8,0,10,0,12,0,14,0,13,0,11,0,9,0,7]
; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,0,9,0,11,0,13,0,14,0,12,0,10,0,9,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [7,8,9,10,11,12,13,14,14,13,12,11,10,9,9,7]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
index 98ea87c..ca57359 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -702,7 +702,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [32,16,16,128,64,16,256,32]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpackuswb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 # [22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm5 # [22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -739,7 +739,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 # [64,256,128,32,32,32,64,64]
; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22]
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
@@ -781,7 +781,7 @@ define <32 x i8> @test_remconstant_32i8(<32 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpackuswb %ymm3, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX2NOBW-NEXT: vpsllw $8, %ymm2, %ymm2
-; AVX2NOBW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX2NOBW-NEXT: vpsubb %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
index a11fa370..b8a131e 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll
@@ -575,7 +575,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [64,16,32,8,8,8,256,16,32,16,16,128,64,16,256,32]
; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3
; AVX512F-NEXT: vpackuswb %ymm4, %ymm3, %ymm4
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 # [38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5 # [38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm5, %ymm5
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 # [0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
@@ -609,7 +609,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,256,128,32,32,32,64,64,16,16,64,32,128,256,16,16]
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0]
+; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38]
; AVX512F-NEXT: vpand %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38]
; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1
@@ -648,7 +648,7 @@ define <64 x i8> @test_remconstant_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpackuswb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,0,9,0,11,0,13,0,15,0,17,0,19,0,21,0,23,0,25,0,27,0,29,0,31,0,33,0,35,0,37,0,38,0,36,0,34,0,32,0,30,0,28,0,26,0,24,0,22,0,20,0,18,0,16,0,14,0,12,0,10,0,8,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm2 # [7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 # [0,8,0,10,0,12,0,14,0,16,0,18,0,20,0,22,0,24,0,26,0,28,0,30,0,32,0,34,0,36,0,38,0,37,0,35,0,33,0,31,0,29,0,27,0,25,0,23,0,21,0,19,0,17,0,15,0,13,0,11,0,9,0,7]
; AVX512BW-NEXT: vpsllw $8, %zmm1, %zmm1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 | (zmm2 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index dbb4b9f..e0410ae 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -84,11 +84,11 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: movq %xmm1, (%rsi)
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; SSE-NEXT: movq %xmm1, (%rsi)
; SSE-NEXT: movq %xmm0, (%rdx)
; SSE-NEXT: retq
;
@@ -96,8 +96,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm1, (%rsi)
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, (%rdx)
; AVX-NEXT: retq
;
@@ -105,8 +105,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm0, (%rdx)
; AVX2-NEXT: retq
;
@@ -114,8 +114,8 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FP-NEXT: retq
;
@@ -123,17 +123,17 @@ define void @load_i16_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i16_stride2_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovdw %xmm0, (%rsi)
-; AVX512-NEXT: vmovq %xmm1, (%rdx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm0, (%rdx)
; AVX512-NEXT: retq
%wide.vec = load <8 x i16>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index da902b3..c932482 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -196,18 +196,18 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,3,4,5,6,7]
+; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,3,2,3,4,5,6,7]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
+; SSE-NEXT: movq %xmm1, (%rdx)
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm1, (%rdx)
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: movq %xmm0, (%rcx)
; SSE-NEXT: retq
;
@@ -217,14 +217,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm2, (%rsi)
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm2, (%rdx)
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm2, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
; AVX-NEXT: vmovq %xmm0, (%rcx)
; AVX-NEXT: retq
;
@@ -234,14 +234,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
; AVX2-NEXT: vmovq %xmm0, (%rcx)
; AVX2-NEXT: retq
;
@@ -251,13 +251,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
; AVX2-FP-NEXT: retq
;
@@ -267,13 +267,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX2-FCP-NEXT: retq
;
@@ -283,14 +283,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
; AVX512-NEXT: vmovq %xmm0, (%rcx)
; AVX512-NEXT: retq
;
@@ -300,13 +300,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512-FCP-NEXT: retq
;
@@ -316,14 +316,14 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-NEXT: retq
;
@@ -333,13 +333,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-FCP-NEXT: retq
;
@@ -348,15 +348,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3]
-; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm3
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3]
+; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -365,13 +366,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11]
-; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11]
+; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -380,15 +381,16 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm2, %ymm1
; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,3,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,1,2,3]
-; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm3
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512DQ-BW-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,1,2,3]
+; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -397,13 +399,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,6,3,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [1,4,7,10,4,7,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [2,5,8,11,2,3,10,11]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm3, %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [1,4,7,10,4,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [2,5,8,11,2,3,10,11]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <12 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index 01aacc1..d4e5d4c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -220,20 +220,20 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE-NEXT: movq %xmm5, (%rsi)
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: movq %xmm3, (%rdx)
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,0,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: movq %xmm3, (%rcx)
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movq %xmm5, (%rsi)
-; SSE-NEXT: movq %xmm3, (%rdx)
-; SSE-NEXT: movq %xmm4, (%rcx)
; SSE-NEXT: movq %xmm0, (%r8)
; SSE-NEXT: retq
;
@@ -246,23 +246,23 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
; AVX-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; AVX-NEXT: vmovq %xmm0, (%rsi)
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX-NEXT: vmovq %xmm0, (%rdx)
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX-NEXT: vmovq %xmm2, (%rcx)
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
-; AVX-NEXT: vmovq %xmm4, (%rcx)
-; AVX-NEXT: vmovq %xmm1, (%r8)
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT: vmovq %xmm0, (%r8)
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i16_stride4_vf4:
@@ -274,23 +274,23 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX2-NEXT: vmovq %xmm0, (%rdx)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-NEXT: vmovq %xmm1, (%r8)
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-NEXT: vmovq %xmm0, (%r8)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride4_vf4:
@@ -302,22 +302,22 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX2-FP-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-FP-NEXT: vmovq %xmm0, (%rdx)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-FP-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm1, (%r8)
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-FP-NEXT: vmovq %xmm0, (%r8)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride4_vf4:
@@ -329,125 +329,125 @@ define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm1, (%r8)
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%r8)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i16_stride4_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqw %ymm0, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride4_vf4:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512-FCP-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512-FCP-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512-FCP-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpmovqw %ymm0, (%r8)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride4_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512DQ-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512DQ-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512DQ-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512DQ-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovqw %ymm0, (%r8)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride4_vf4:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i16_stride4_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512BW-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512BW-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512BW-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512BW-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512BW-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovqw %ymm0, (%r8)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride4_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512BW-FCP-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512BW-FCP-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%r8)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride4_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512DQ-BW-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512DQ-BW-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%r8)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm3, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%r8)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <16 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 9b19ec1..8fb6222 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -288,55 +288,55 @@ define void @load_i16_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i16_stride5_vf4:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm2
-; SSE-NEXT: movdqa 16(%rdi), %xmm3
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movdqa 32(%rdi), %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrlq $48, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,0,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm7, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm7
-; SSE-NEXT: por %xmm2, %xmm7
-; SSE-NEXT: movq %xmm1, (%rsi)
-; SSE-NEXT: movq %xmm4, (%rdx)
-; SSE-NEXT: movq %xmm5, (%rcx)
-; SSE-NEXT: movq %xmm6, (%r8)
-; SSE-NEXT: movq %xmm7, (%r9)
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: movq %xmm4, (%rsi)
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: psrlq $48, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,3,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
+; SSE-NEXT: movq %xmm3, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,0,3,4,5,6,7]
+; SSE-NEXT: movq %xmm3, (%rcx)
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: movq %xmm3, (%r8)
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pandn %xmm0, %xmm3
+; SSE-NEXT: por %xmm1, %xmm3
+; SSE-NEXT: movq %xmm3, (%r9)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i16_stride5_vf4:
@@ -349,30 +349,30 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX-NEXT: vpsrlq $48, %xmm2, %xmm4
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7]
; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: vmovq %xmm4, (%rdx)
-; AVX-NEXT: vmovq %xmm5, (%rcx)
-; AVX-NEXT: vmovq %xmm6, (%r8)
-; AVX-NEXT: vmovq %xmm1, (%r9)
+; AVX-NEXT: vpsrlq $48, %xmm2, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm0, (%rdx)
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm0, (%rcx)
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vmovq %xmm0, (%r8)
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
+; AVX-NEXT: vmovq %xmm0, (%r9)
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i16_stride5_vf4:
@@ -385,22 +385,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm3, (%rsi)
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm3, (%rcx)
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm3, (%r8)
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-NEXT: vmovq %xmm5, (%rcx)
-; AVX2-NEXT: vmovq %xmm6, (%r8)
; AVX2-NEXT: vmovq %xmm0, (%r9)
; AVX2-NEXT: retq
;
@@ -412,22 +412,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm3, (%rcx)
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm3, (%r8)
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm5, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm6, (%r8)
; AVX2-FP-NEXT: vmovq %xmm0, (%r9)
; AVX2-FP-NEXT: retq
;
@@ -439,58 +439,64 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX2-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i16_stride5_vf4:
; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vpextrw $5, %xmm0, %eax
-; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512-NEXT: vpextrw $6, %xmm0, %eax
-; AVX512-NEXT: vpextrw $1, %xmm0, %r10d
-; AVX512-NEXT: vmovd %r10d, %xmm4
-; AVX512-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
-; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1
-; AVX512-NEXT: vmovd %xmm2, %eax
+; AVX512-NEXT: vpextrw $5, %xmm0, %r10d
+; AVX512-NEXT: vmovd %xmm2, %r11d
+; AVX512-NEXT: vpextrw $3, %xmm1, %ebx
+; AVX512-NEXT: vpextrw $6, %xmm0, %ebp
+; AVX512-NEXT: vpextrw $1, %xmm0, %r14d
+; AVX512-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX512-NEXT: vmovq %xmm1, (%rsi)
+; AVX512-NEXT: vmovd %r14d, %xmm1
+; AVX512-NEXT: vpinsrw $1, %ebp, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1
+; AVX512-NEXT: vpinsrw $3, %r11d, %xmm1, %xmm1
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm1, (%rcx)
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm1, (%r8)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-NEXT: vmovq %xmm1, (%rdx)
-; AVX512-NEXT: vmovq %xmm5, (%rcx)
-; AVX512-NEXT: vmovq %xmm6, (%r8)
; AVX512-NEXT: vmovq %xmm0, (%r9)
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride5_vf4:
@@ -498,65 +504,71 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512-FCP-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovd %xmm2, %eax
+; AVX512-FCP-NEXT: vmovd %xmm2, %r10d
+; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %r11d
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX512-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpinsrw $3, %r10d, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm1, (%r8)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride5_vf4:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: pushq %rbp
+; AVX512DQ-NEXT: pushq %r14
+; AVX512DQ-NEXT: pushq %rbx
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-NEXT: vpextrw $5, %xmm0, %eax
-; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512DQ-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpextrw $6, %xmm0, %eax
-; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r10d
-; AVX512DQ-NEXT: vmovd %r10d, %xmm4
-; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1
-; AVX512DQ-NEXT: vmovd %xmm2, %eax
+; AVX512DQ-NEXT: vpextrw $5, %xmm0, %r10d
+; AVX512DQ-NEXT: vmovd %xmm2, %r11d
+; AVX512DQ-NEXT: vpextrw $3, %xmm1, %ebx
+; AVX512DQ-NEXT: vpextrw $6, %xmm0, %ebp
+; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r14d
+; AVX512DQ-NEXT: vpinsrw $1, %r10d, %xmm0, %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX512DQ-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-NEXT: vmovd %r14d, %xmm1
+; AVX512DQ-NEXT: vpinsrw $1, %ebp, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpinsrw $2, %ebx, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpinsrw $3, %r11d, %xmm1, %xmm1
+; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm1, (%r8)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-NEXT: popq %rbx
+; AVX512DQ-NEXT: popq %r14
+; AVX512DQ-NEXT: popq %rbp
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride5_vf4:
@@ -564,29 +576,29 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vmovd %xmm2, %eax
+; AVX512DQ-FCP-NEXT: vmovd %xmm2, %r10d
+; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, %r11d
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpinsrw $2, %r11d, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpinsrw $3, %r10d, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm0[1,2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-FCP-NEXT: retq
;
@@ -600,19 +612,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
+; AVX512BW-NEXT: movl 32(%rdi), %edi
; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -626,19 +639,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
+; AVX512BW-FCP-NEXT: movl 32(%rdi), %edi
; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-FCP-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -652,19 +666,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
+; AVX512DQ-BW-NEXT: movl 32(%rdi), %edi
; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm3, %zmm3
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm4, %zmm4
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm5, %zmm2
; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-BW-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -678,19 +693,20 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
+; AVX512DQ-BW-FCP-NEXT: movl 32(%rdi), %edi
; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm3, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm5, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,7,12,17,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,8,13,18,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,9,14,19,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <20 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index feb75b2..dc8a9ed 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -382,57 +382,57 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
-; SSE-NEXT: movdqa 32(%rdi), %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,6,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; SSE-NEXT: movdqa 32(%rdi), %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,6,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE-NEXT: movq %xmm2, (%rsi)
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: psrld $16, %xmm6
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; SSE-NEXT: pand %xmm2, %xmm4
+; SSE-NEXT: por %xmm5, %xmm4
+; SSE-NEXT: movq %xmm4, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3]
+; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: pandn %xmm5, %xmm6
-; SSE-NEXT: movdqa %xmm1, %xmm7
-; SSE-NEXT: psrld $16, %xmm7
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3]
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: por %xmm6, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,3,2,3]
-; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE-NEXT: movdqa %xmm2, %xmm8
-; SSE-NEXT: pandn %xmm5, %xmm8
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,2,3,4,5,6,7]
-; SSE-NEXT: pand %xmm2, %xmm9
-; SSE-NEXT: por %xmm8, %xmm9
+; SSE-NEXT: pandn %xmm3, %xmm6
+; SSE-NEXT: movdqa %xmm0, %xmm3
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[0,2,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,2,3,4,5,6,7]
+; SSE-NEXT: pand %xmm2, %xmm7
+; SSE-NEXT: por %xmm6, %xmm7
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm2, %xmm5
-; SSE-NEXT: pandn %xmm6, %xmm2
-; SSE-NEXT: por %xmm5, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; SSE-NEXT: movq %xmm7, (%rcx)
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: pandn %xmm5, %xmm2
+; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: movq %xmm2, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,2,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: movq %xmm3, (%r9)
; SSE-NEXT: psrlq $48, %xmm1
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[1,3,2,3,4,5,6,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movq %xmm4, (%rsi)
-; SSE-NEXT: movq %xmm3, (%rdx)
-; SSE-NEXT: movq %xmm9, (%rcx)
-; SSE-NEXT: movq %xmm2, (%r8)
-; SSE-NEXT: movq %xmm6, (%r9)
; SSE-NEXT: movq %xmm0, (%rax)
; SSE-NEXT: retq
;
@@ -448,32 +448,32 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm5
+; AVX-NEXT: vmovq %xmm4, (%rsi)
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm4
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5],xmm6[6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
+; AVX-NEXT: vmovq %xmm3, (%rdx)
+; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[4,5,0,1,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovq %xmm3, (%rcx)
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm2[4,5],xmm4[6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm3, (%r8)
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,2,2,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX-NEXT: vmovq %xmm3, (%r9)
; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,3,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm4, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
-; AVX-NEXT: vmovq %xmm5, (%rcx)
-; AVX-NEXT: vmovq %xmm6, (%r8)
-; AVX-NEXT: vmovq %xmm7, (%r9)
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
;
@@ -486,24 +486,24 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpsrld $16, %xmm1, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm3, (%rsi)
+; AVX2-NEXT: vpsrld $16, %xmm1, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm3, (%r8)
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-NEXT: vmovq %xmm5, (%r8)
; AVX2-NEXT: vmovq %xmm1, (%r9)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
;
@@ -516,23 +516,23 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm4
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
+; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm3, (%r8)
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm5, (%r8)
; AVX2-FP-NEXT: vmovq %xmm1, (%r9)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FP-NEXT: retq
;
@@ -545,23 +545,23 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm4
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0],xmm3[1,2],xmm1[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%r8)
; AVX2-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FCP-NEXT: retq
;
@@ -574,26 +574,26 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512-NEXT: vmovq %xmm3, (%rsi)
; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512-NEXT: vpermd %zmm2, %zmm5, %zmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm3, (%rsi)
; AVX512-NEXT: vmovq %xmm0, (%rdx)
-; AVX512-NEXT: vmovq %xmm4, (%rcx)
-; AVX512-NEXT: vmovq %xmm1, (%r8)
-; AVX512-NEXT: vmovq %xmm5, (%r9)
-; AVX512-NEXT: vmovq %xmm2, (%rax)
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7]
+; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm1, (%rcx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm0, (%r8)
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11]
+; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm1, (%r9)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -606,25 +606,25 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
; AVX512-FCP-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm1, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rax)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7]
+; AVX512-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11]
+; AVX512-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -637,26 +637,26 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
; AVX512DQ-NEXT: vmovq %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm1, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-NEXT: vmovq %xmm2, (%rax)
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7]
+; AVX512DQ-NEXT: vpermd %zmm4, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11]
+; AVX512DQ-NEXT: vpermd %zmm4, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -669,25 +669,25 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,1,10,7]
+; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [8,5,2,11]
+; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -697,22 +697,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm1
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm5, (%r9)
-; AVX512BW-NEXT: vmovq %xmm1, (%rax)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r9)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -722,22 +722,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -747,22 +747,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm1
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -772,22 +772,22 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,7,13,19,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,8,14,20,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,9,15,21,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,10,16,22,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,11,17,23,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,7,13,19,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,8,14,20,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,9,15,21,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,10,16,22,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,11,17,23,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <24 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 038c73b..e89248a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -418,77 +418,77 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i16_stride7_vf4:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm1
-; SSE-NEXT: movdqa 16(%rdi), %xmm4
-; SSE-NEXT: movdqa 32(%rdi), %xmm3
-; SSE-NEXT: movdqa 48(%rdi), %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: pandn %xmm2, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: por %xmm5, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm4, %xmm7
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movdqa 32(%rdi), %xmm1
+; SSE-NEXT: movdqa 48(%rdi), %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: pandn %xmm3, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; SSE-NEXT: pand %xmm5, %xmm7
-; SSE-NEXT: pandn %xmm1, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,0,3,3,4,5,6,7]
-; SSE-NEXT: pand %xmm0, %xmm7
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE-NEXT: pandn %xmm5, %xmm0
-; SSE-NEXT: por %xmm7, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,3,2,3,4,5,6,7]
-; SSE-NEXT: movdqa %xmm1, %xmm8
-; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,2,3]
+; SSE-NEXT: por %xmm6, %xmm7
+; SSE-NEXT: movq %xmm7, (%rsi)
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm2, %xmm6
+; SSE-NEXT: pand %xmm3, %xmm6
+; SSE-NEXT: pandn %xmm0, %xmm3
+; SSE-NEXT: por %xmm6, %xmm3
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[1,0,3,3,4,5,6,7]
+; SSE-NEXT: pand %xmm5, %xmm6
+; SSE-NEXT: movdqa %xmm1, %xmm3
+; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: por %xmm6, %xmm5
+; SSE-NEXT: movq %xmm5, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,3,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
-; SSE-NEXT: movdqa %xmm3, %xmm10
-; SSE-NEXT: movdqa %xmm3, %xmm9
-; SSE-NEXT: psrlq $16, %xmm9
-; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1]
-; SSE-NEXT: pslld $16, %xmm6
-; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm10
-; SSE-NEXT: psrld $16, %xmm10
-; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE-NEXT: psrlq $48, %xmm4
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; SSE-NEXT: movq %xmm7, (%rcx)
+; SSE-NEXT: movdqa %xmm1, %xmm7
+; SSE-NEXT: movdqa %xmm1, %xmm6
+; SSE-NEXT: psrlq $16, %xmm6
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1]
+; SSE-NEXT: pslld $16, %xmm4
+; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
-; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm0, (%rdx)
-; SSE-NEXT: movq %xmm7, (%rcx)
-; SSE-NEXT: movq %xmm8, (%r8)
-; SSE-NEXT: movq %xmm6, (%r9)
-; SSE-NEXT: movq %xmm10, (%rdi)
-; SSE-NEXT: movq %xmm1, (%rax)
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; SSE-NEXT: movq %xmm5, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: psrld $16, %xmm5
+; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: psrlq $48, %xmm2
+; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: movq %xmm4, (%r9)
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; SSE-NEXT: movq %xmm5, (%rcx)
+; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movq %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i16_stride7_vf4:
@@ -497,54 +497,54 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,3,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,3,4,5,6,7]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,3]
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%rsi)
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3],xmm4[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%rdx)
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
-; AVX-NEXT: vpslld $16, %xmm2, %xmm9
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2]
-; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7]
-; AVX-NEXT: vpsrlq $16, %xmm4, %xmm9
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
-; AVX-NEXT: vpsrlq $48, %xmm1, %xmm10
-; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm10
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
+; AVX-NEXT: vmovq %xmm4, (%rcx)
+; AVX-NEXT: vpslld $16, %xmm2, %xmm4
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r8)
+; AVX-NEXT: vpsrlq $16, %xmm3, %xmm4
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3],xmm10[4,5,6,7]
+; AVX-NEXT: vpsrlq $48, %xmm1, %xmm6
+; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r9)
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm4
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7]
+; AVX-NEXT: vmovq %xmm3, (%r10)
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vmovq %xmm3, (%rsi)
-; AVX-NEXT: vmovq %xmm5, (%rdx)
-; AVX-NEXT: vmovq %xmm7, (%rcx)
-; AVX-NEXT: vmovq %xmm8, (%r8)
-; AVX-NEXT: vmovq %xmm9, (%r9)
-; AVX-NEXT: vmovq %xmm4, (%r10)
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
;
@@ -552,51 +552,51 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX2-NEXT: vmovdqa (%rdi), %xmm3
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-NEXT: vmovdqa (%rdi), %xmm2
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7]
+; AVX2-NEXT: vmovq %xmm5, (%rsi)
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7]
+; AVX2-NEXT: vmovq %xmm5, (%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
+; AVX2-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm2, (%r8)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX2-NEXT: vmovq %xmm2, (%r9)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX2-NEXT: vmovq %xmm2, (%r10)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX2-NEXT: vmovq %xmm1, (%rsi)
-; AVX2-NEXT: vmovq %xmm6, (%rdx)
-; AVX2-NEXT: vmovq %xmm3, (%rcx)
-; AVX2-NEXT: vmovq %xmm4, (%r8)
-; AVX2-NEXT: vmovq %xmm5, (%r9)
-; AVX2-NEXT: vmovq %xmm7, (%r10)
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -605,8 +605,8 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4
@@ -615,37 +615,37 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm5, (%rsi)
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm5, (%rdx)
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm8
-; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm2, (%r8)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX2-FP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX2-FP-NEXT: vmovq %xmm2, (%r10)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-FP-NEXT: vmovq %xmm5, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm6, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm2, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm3, (%r8)
-; AVX2-FP-NEXT: vmovq %xmm4, (%r9)
-; AVX2-FP-NEXT: vmovq %xmm7, (%r10)
; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -654,8 +654,8 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
@@ -664,37 +664,37 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi)
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx)
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm8
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r8)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r10)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%r9)
-; AVX2-FCP-NEXT: vmovq %xmm7, (%r10)
; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -708,47 +708,47 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3]
+; AVX512-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512-NEXT: vmovq %xmm5, (%rsi)
+; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7]
+; AVX512-NEXT: vmovq %xmm5, (%rdx)
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: vmovq %xmm1, (%rdx)
; AVX512-NEXT: vmovq %xmm2, (%rcx)
-; AVX512-NEXT: vmovq %xmm5, (%r8)
-; AVX512-NEXT: vmovq %xmm6, (%r9)
-; AVX512-NEXT: vmovq %xmm7, (%r10)
-; AVX512-NEXT: vmovq %xmm3, (%rax)
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm2, (%r8)
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovq %xmm2, (%r9)
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovq %xmm2, (%r10)
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT: vmovq %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -756,48 +756,48 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%r9)
-; AVX512-FCP-NEXT: vmovq %xmm7, (%r10)
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rax)
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm4[1,2,3,4,5,6],xmm1[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-FCP-NEXT: vmovq %xmm0, (%r10)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -810,47 +810,47 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3]
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-NEXT: vmovq %xmm5, (%rsi)
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vmovq %xmm5, (%rdx)
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
; AVX512DQ-NEXT: vmovq %xmm2, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm6, (%r9)
-; AVX512DQ-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rax)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm2, (%r8)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%r10)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -858,48 +858,48 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9)
-; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm4[1,2,3,4,5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r10)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -910,25 +910,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm1
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm5, (%r9)
-; AVX512BW-NEXT: vmovq %xmm6, (%r10)
-; AVX512BW-NEXT: vmovq %xmm1, (%rax)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r9)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r10)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -939,25 +939,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r10)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r10)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -968,25 +968,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm1
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r10)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r10)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -997,25 +997,25 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,22,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,9,16,23,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,10,17,24,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,11,18,25,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,12,19,26,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,13,20,27,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,8,15,22,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,9,16,23,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,10,17,24,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,11,18,25,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,12,19,26,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,13,20,27,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <28 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index fff21f9..b249950 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -296,41 +296,41 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa 16(%rdi), %xmm1
-; SSE-NEXT: movdqa 32(%rdi), %xmm2
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
+; SSE-NEXT: movdqa 32(%rdi), %xmm1
; SSE-NEXT: movdqa 48(%rdi), %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE-NEXT: movdqa %xmm0, %xmm5
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; SSE-NEXT: movdqa %xmm5, %xmm6
; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[3,3,3,3]
+; SSE-NEXT: movq %xmm6, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; SSE-NEXT: movq %xmm7, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[3,3,3,3]
; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE-NEXT: movq %xmm5, (%rcx)
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; SSE-NEXT: movq %xmm6, (%r8)
+; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: movq %xmm2, (%r9)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: movq %xmm6, (%rsi)
-; SSE-NEXT: movq %xmm8, (%rdx)
-; SSE-NEXT: movq %xmm5, (%rcx)
-; SSE-NEXT: movq %xmm7, (%r8)
-; SSE-NEXT: movq %xmm1, (%r9)
-; SSE-NEXT: movq %xmm4, (%r11)
+; SSE-NEXT: movq %xmm3, (%r11)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: movq %xmm0, (%r10)
-; SSE-NEXT: movq %xmm3, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE-NEXT: movq %xmm2, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i16_stride8_vf4:
@@ -345,28 +345,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX-NEXT: vmovq %xmm6, (%rsi)
+; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
+; AVX-NEXT: vmovq %xmm6, (%rdx)
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX-NEXT: vmovq %xmm6, (%rcx)
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r8)
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vmovq %xmm6, (%rsi)
-; AVX-NEXT: vmovq %xmm7, (%rdx)
-; AVX-NEXT: vmovq %xmm8, (%rcx)
-; AVX-NEXT: vmovq %xmm4, (%r8)
; AVX-NEXT: vmovq %xmm1, (%r9)
-; AVX-NEXT: vmovq %xmm3, (%r11)
-; AVX-NEXT: vmovq %xmm5, (%r10)
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vmovq %xmm1, (%r11)
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX-NEXT: vmovq %xmm1, (%r10)
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
;
@@ -382,28 +382,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-NEXT: vmovq %xmm6, (%rsi)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX2-NEXT: vmovq %xmm6, (%rdx)
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-NEXT: vmovq %xmm6, (%rcx)
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovq %xmm4, (%r8)
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-NEXT: vmovq %xmm6, (%rsi)
-; AVX2-NEXT: vmovq %xmm7, (%rdx)
-; AVX2-NEXT: vmovq %xmm8, (%rcx)
-; AVX2-NEXT: vmovq %xmm4, (%r8)
; AVX2-NEXT: vmovq %xmm1, (%r9)
-; AVX2-NEXT: vmovq %xmm3, (%r11)
-; AVX2-NEXT: vmovq %xmm5, (%r10)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-NEXT: vmovq %xmm1, (%r11)
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT: vmovq %xmm1, (%r10)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
;
@@ -419,28 +419,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-FP-NEXT: vmovq %xmm6, (%rsi)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX2-FP-NEXT: vmovq %xmm6, (%rdx)
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-FP-NEXT: vmovq %xmm6, (%rcx)
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovq %xmm4, (%r8)
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-FP-NEXT: vmovq %xmm6, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm7, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm8, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm4, (%r8)
; AVX2-FP-NEXT: vmovq %xmm1, (%r9)
-; AVX2-FP-NEXT: vmovq %xmm3, (%r11)
-; AVX2-FP-NEXT: vmovq %xmm5, (%r10)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-FP-NEXT: vmovq %xmm1, (%r11)
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-FP-NEXT: vmovq %xmm1, (%r10)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FP-NEXT: retq
;
@@ -456,28 +456,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx)
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx)
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm4, (%r8)
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm8, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%r8)
; AVX2-FCP-NEXT: vmovq %xmm1, (%r9)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%r11)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%r10)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm1, (%r11)
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-FCP-NEXT: vmovq %xmm1, (%r10)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FCP-NEXT: retq
;
@@ -493,25 +493,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
-; AVX512-NEXT: vpermt2d %xmm4, %xmm9, %xmm5
+; AVX512-NEXT: vmovq %xmm6, (%rsi)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX512-NEXT: vmovq %xmm6, (%rdx)
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-NEXT: vmovq %xmm6, (%rcx)
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,3,3]
+; AVX512-NEXT: vpermt2d %xmm4, %xmm6, %xmm5
+; AVX512-NEXT: vmovq %xmm5, (%r8)
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-NEXT: vpermt2d %xmm2, %xmm9, %xmm0
-; AVX512-NEXT: vmovq %xmm6, (%rsi)
-; AVX512-NEXT: vmovq %xmm7, (%rdx)
-; AVX512-NEXT: vmovq %xmm8, (%rcx)
-; AVX512-NEXT: vmovq %xmm5, (%r8)
; AVX512-NEXT: vmovq %xmm1, (%r9)
-; AVX512-NEXT: vmovq %xmm3, (%r11)
-; AVX512-NEXT: vmovq %xmm4, (%r10)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX512-NEXT: vmovq %xmm1, (%r11)
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-NEXT: vmovq %xmm1, (%r10)
+; AVX512-NEXT: vpermt2d %xmm2, %xmm6, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rax)
; AVX512-NEXT: retq
;
@@ -527,25 +527,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
-; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm8
-; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
-; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5
+; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,5,1,1]
+; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm7
+; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm6, %xmm7
+; AVX512-FCP-NEXT: vmovq %xmm7, (%rdx)
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512-FCP-NEXT: vmovq %xmm7, (%rcx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [3,7,3,3]
+; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm5
+; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0
-; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm8, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm9, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
; AVX512-FCP-NEXT: vmovq %xmm1, (%r9)
-; AVX512-FCP-NEXT: vmovq %xmm7, (%r11)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%r10)
+; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm6
+; AVX512-FCP-NEXT: vmovq %xmm6, (%r11)
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-FCP-NEXT: vmovq %xmm1, (%r10)
+; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm7, %xmm0
; AVX512-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512-FCP-NEXT: retq
;
@@ -561,25 +561,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
-; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm9, %xmm5
+; AVX512DQ-NEXT: vmovq %xmm6, (%rsi)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX512DQ-NEXT: vmovq %xmm6, (%rdx)
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-NEXT: vmovq %xmm6, (%rcx)
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,3,3]
+; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm6, %xmm5
+; AVX512DQ-NEXT: vmovq %xmm5, (%r8)
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm9, %xmm0
-; AVX512DQ-NEXT: vmovq %xmm6, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm7, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm8, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm5, (%r8)
; AVX512DQ-NEXT: vmovq %xmm1, (%r9)
-; AVX512DQ-NEXT: vmovq %xmm3, (%r11)
-; AVX512DQ-NEXT: vmovq %xmm4, (%r10)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
+; AVX512DQ-NEXT: vmovq %xmm1, (%r11)
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512DQ-NEXT: vmovq %xmm1, (%r10)
+; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm6, %xmm0
; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-NEXT: retq
;
@@ -595,25 +595,25 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5
+; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,5,1,1]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm7
+; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm6, %xmm7
+; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%rdx)
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%rcx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [3,7,3,3]
+; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm5
+; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm9, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9)
-; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r11)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r10)
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm6
+; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r11)
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r10)
+; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm7, %xmm0
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: retq
;
@@ -625,28 +625,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm7
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm8, %zmm1
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm5, (%r9)
-; AVX512BW-NEXT: vmovq %xmm6, (%r11)
-; AVX512BW-NEXT: vmovq %xmm7, (%r10)
-; AVX512BW-NEXT: vmovq %xmm1, (%rax)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r9)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r11)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%r10)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -658,28 +658,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11)
-; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r11)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r10)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -691,28 +691,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm7
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm8, %zmm1
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r11)
-; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r11)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r10)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -724,28 +724,28 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,9,17,25,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [2,10,18,26,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [3,11,19,27,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [4,12,20,28,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [5,13,21,29,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,14,22,30,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [7,15,23,31,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <32 x i16>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
index f2c5a91..995d641 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
@@ -20,8 +20,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT: movq %xmm1, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; SSE-NEXT: movq %xmm0, (%rdx)
; SSE-NEXT: retq
;
@@ -29,8 +29,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-NEXT: vmovlps %xmm1, (%rsi)
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-NEXT: vmovlps %xmm0, (%rdx)
; AVX-NEXT: retq
;
@@ -38,8 +38,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-NEXT: vmovlps %xmm1, (%rsi)
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-NEXT: vmovlps %xmm0, (%rdx)
; AVX2-NEXT: retq
;
@@ -47,8 +47,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx)
; AVX2-FP-NEXT: retq
;
@@ -56,8 +56,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FCP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx)
; AVX2-FCP-NEXT: retq
;
@@ -65,8 +65,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %xmm0
; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512-NEXT: vmovlps %xmm0, (%rdx)
; AVX512-NEXT: retq
;
@@ -74,8 +74,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512-FCP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512-FCP-NEXT: vmovlps %xmm0, (%rdx)
; AVX512-FCP-NEXT: retq
;
@@ -83,8 +83,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
@@ -92,8 +92,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: retq
;
@@ -101,8 +101,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512BW-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512BW-NEXT: vmovlps %xmm0, (%rdx)
; AVX512BW-NEXT: retq
;
@@ -110,8 +110,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: retq
;
@@ -119,8 +119,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: retq
;
@@ -128,8 +128,8 @@ define void @load_i32_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <4 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index 34f2321..8af9594 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -21,13 +21,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: movq %xmm2, (%rdx)
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm3, (%rdx)
; SSE-NEXT: movq %xmm0, (%rcx)
; SSE-NEXT: retq
;
@@ -36,12 +36,12 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vmovaps 16(%rdi), %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%rsi)
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%rdx)
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX-NEXT: vmovlps %xmm2, (%rsi)
-; AVX-NEXT: vmovlps %xmm3, (%rdx)
; AVX-NEXT: vmovlps %xmm0, (%rcx)
; AVX-NEXT: retq
;
@@ -50,13 +50,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-NEXT: vmovlps %xmm0, (%rdx)
-; AVX2-NEXT: vmovlps %xmm1, (%rcx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX2-NEXT: vmovlps %xmm0, (%rcx)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i32_stride3_vf2:
@@ -64,13 +64,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx)
-; AVX2-FP-NEXT: vmovlps %xmm1, (%rcx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX2-FP-NEXT: vmovlps %xmm0, (%rcx)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i32_stride3_vf2:
@@ -78,13 +78,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx)
-; AVX2-FCP-NEXT: vmovlps %xmm1, (%rcx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm0, (%rcx)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i32_stride3_vf2:
@@ -92,13 +92,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovaps (%rdi), %xmm0
; AVX512-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512-NEXT: vmovlps %xmm0, (%rcx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i32_stride3_vf2:
@@ -119,13 +119,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512DQ-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512DQ-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512DQ-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512DQ-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rcx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2:
@@ -146,13 +146,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512BW-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512BW-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512BW-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512BW-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512BW-NEXT: vmovlps %xmm0, (%rcx)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i32_stride3_vf2:
@@ -173,13 +173,13 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512DQ-BW-NEXT: vbroadcastss 8(%rdi), %xmm3
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx)
-; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rcx)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
index 822d31e..f7ddcfc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -22,13 +22,13 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm3, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE-NEXT: movq %xmm2, (%rdx)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: movq %xmm0, (%rcx)
-; SSE-NEXT: movq %xmm1, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: movq %xmm0, (%r8)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_stride4_vf2:
@@ -36,11 +36,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovq %xmm2, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7]
+; AVX-NEXT: vmovq %xmm2, (%rdx)
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vmovq %xmm0, (%rcx)
; AVX-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX-NEXT: retq
@@ -50,11 +50,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vmovq %xmm0, (%rcx)
; AVX2-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX2-NEXT: retq
@@ -64,11 +64,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
; AVX2-FP-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX2-FP-NEXT: retq
@@ -78,11 +78,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX2-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX2-FCP-NEXT: retq
@@ -92,11 +92,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovq %xmm0, (%rcx)
; AVX512-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512-NEXT: retq
@@ -108,9 +108,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
; AVX512-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512-FCP-NEXT: vzeroupper
@@ -121,11 +121,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512DQ-NEXT: retq
@@ -137,9 +137,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
; AVX512DQ-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
@@ -150,11 +150,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512BW-NEXT: retq
@@ -166,9 +166,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
; AVX512BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512BW-FCP-NEXT: vzeroupper
@@ -179,11 +179,11 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512DQ-BW-NEXT: retq
@@ -195,9 +195,9 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,1,1]
; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index 4f80140..fea8ebd 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -24,19 +24,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: movq %xmm4, (%rsi)
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE-NEXT: movq %xmm4, (%rdx)
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,2,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3]
; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movq %xmm4, (%rsi)
-; SSE-NEXT: movq %xmm5, (%rdx)
; SSE-NEXT: movq %xmm0, (%rcx)
-; SSE-NEXT: movq %xmm6, (%r8)
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: movq %xmm4, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movq %xmm1, (%r9)
; SSE-NEXT: retq
;
@@ -46,16 +46,16 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
; AVX-NEXT: vmovq %xmm3, (%rsi)
-; AVX-NEXT: vmovq %xmm4, (%rdx)
-; AVX-NEXT: vpextrq $1, %xmm5, (%rcx)
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX-NEXT: vmovq %xmm3, (%rdx)
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; AVX-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX-NEXT: vmovq %xmm0, (%r8)
-; AVX-NEXT: vmovq %xmm1, (%r9)
+; AVX-NEXT: vmovq %xmm4, (%r9)
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i32_stride5_vf2:
@@ -64,17 +64,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX2-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX2-NEXT: vpbroadcastd 16(%rdi), %ymm4
; AVX2-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX2-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; AVX2-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX2-NEXT: vmovq %xmm0, (%r8)
-; AVX2-NEXT: vmovq %xmm2, (%r9)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX2-NEXT: vmovq %xmm0, (%r9)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -84,17 +84,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX2-FP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX2-FP-NEXT: vpbroadcastd 16(%rdi), %ymm4
; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; AVX2-FP-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX2-FP-NEXT: vmovq %xmm0, (%r8)
-; AVX2-FP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX2-FP-NEXT: vmovq %xmm0, (%r9)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
@@ -104,17 +104,17 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
-; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX2-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vpextrq $1, %xmm1, (%rcx)
+; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX2-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX2-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
@@ -123,21 +123,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-NEXT: vmovq %xmm4, (%rdx)
+; AVX512-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512-NEXT: vmovq %xmm1, (%rsi)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
+; AVX512-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
; AVX512-NEXT: vmovq %xmm1, (%rcx)
+; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512-NEXT: vmovq %xmm0, (%r8)
-; AVX512-NEXT: vmovq %xmm2, (%r9)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512-NEXT: vmovq %xmm0, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -146,19 +146,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -167,21 +167,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512DQ-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512DQ-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rdx)
+; AVX512DQ-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512DQ-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
; AVX512DQ-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512DQ-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -190,19 +190,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -211,21 +211,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512BW-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512BW-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm4, (%rdx)
+; AVX512BW-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512BW-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
; AVX512BW-NEXT: vmovq %xmm1, (%rcx)
+; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512BW-NEXT: vmovq %xmm0, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -234,19 +234,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -255,21 +255,21 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512DQ-BW-NEXT: vpextrd $2, %xmm1, %eax
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
-; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512DQ-BW-NEXT: vpextrd $3, %xmm1, %eax
-; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
-; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-BW-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx)
+; AVX512DQ-BW-NEXT: vpextrd $3, %xmm1, %r10d
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vpbroadcastd 16(%rdi), %ymm4
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm3, %xmm1
; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -278,19 +278,19 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],mem[1],xmm0[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,6,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm3
; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [2,7,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 16(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3]
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,6,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,7,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <10 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index 85ed618..49b1318 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -18,31 +18,31 @@
define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
; SSE-LABEL: load_i32_stride6_vf2:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; SSE-NEXT: movq %xmm1, (%rsi)
-; SSE-NEXT: movq %xmm4, (%rdx)
-; SSE-NEXT: movq %xmm5, (%rcx)
-; SSE-NEXT: movq %xmm6, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq %xmm3, (%rdx)
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE-NEXT: movq %xmm4, (%rcx)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; SSE-NEXT: movq %xmm5, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movq %xmm0, (%r9)
-; SSE-NEXT: movq %xmm7, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: movq %xmm3, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_stride6_vf2:
@@ -53,22 +53,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovaps 32(%rdi), %xmm2
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm3, (%rsi)
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; AVX-NEXT: vmovlps %xmm3, (%rdx)
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm0[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm3, (%rcx)
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,3,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; AVX-NEXT: vmovlps %xmm3, (%rsi)
-; AVX-NEXT: vmovlps %xmm4, (%rdx)
-; AVX-NEXT: vmovlps %xmm5, (%rcx)
; AVX-NEXT: vmovlps %xmm0, (%r8)
-; AVX-NEXT: vmovlps %xmm6, (%r9)
-; AVX-NEXT: vmovlps %xmm1, (%rax)
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,2,3,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: vmovlps %xmm0, (%r9)
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm2[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX-NEXT: vmovlps %xmm0, (%rax)
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i32_stride6_vf2:
@@ -80,22 +80,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovaps 32(%rdi), %xmm3
; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
+; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-NEXT: vmovlps %xmm4, (%rsi)
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rcx)
; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX2-NEXT: vpermps %ymm0, %ymm6, %ymm0
-; AVX2-NEXT: vmovlps %xmm4, (%rsi)
-; AVX2-NEXT: vmovlps %xmm2, (%rdx)
-; AVX2-NEXT: vmovlps %xmm5, (%rcx)
; AVX2-NEXT: vmovlps %xmm1, (%r8)
-; AVX2-NEXT: vmovlps %xmm3, (%r9)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-NEXT: vmovlps %xmm1, (%r9)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0]
+; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovlps %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -109,22 +109,22 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm3
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FP-NEXT: vmovlps %xmm4, (%rsi)
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rcx)
; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FP-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX2-FP-NEXT: vpermps %ymm0, %ymm6, %ymm0
-; AVX2-FP-NEXT: vmovlps %xmm4, (%rsi)
-; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX2-FP-NEXT: vmovlps %xmm5, (%rcx)
; AVX2-FP-NEXT: vmovlps %xmm1, (%r8)
-; AVX2-FP-NEXT: vmovlps %xmm3, (%r9)
+; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
+; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-FP-NEXT: vmovlps %xmm1, (%r9)
+; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0]
+; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -138,54 +138,56 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm3
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
+; AVX2-FCP-NEXT: vmovlps %xmm4, (%rsi)
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3]
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm1[2,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rcx)
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
-; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
-; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX2-FCP-NEXT: vpermps %ymm0, %ymm6, %ymm0
-; AVX2-FCP-NEXT: vmovlps %xmm4, (%rsi)
-; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX2-FCP-NEXT: vmovlps %xmm5, (%rcx)
; AVX2-FCP-NEXT: vmovlps %xmm1, (%r8)
-; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9)
+; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
+; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vmovlps %xmm1, (%r9)
+; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm1 = [5,3,0,0]
+; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i32_stride6_vf2:
; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512-NEXT: vmovd %xmm2, %ebx
+; AVX512-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512-NEXT: vmovq %xmm1, (%rsi)
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512-NEXT: vmovd %xmm2, %r10d
-; AVX512-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
+; AVX512-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512-NEXT: vmovq %xmm1, (%rcx)
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-NEXT: vmovq %xmm1, (%rdx)
-; AVX512-NEXT: vmovq %xmm4, (%rcx)
; AVX512-NEXT: vmovq %xmm0, (%r8)
-; AVX512-NEXT: vmovlps %xmm2, (%r9)
-; AVX512-NEXT: vmovlps %xmm5, (%rax)
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vmovlps %xmm0, (%r9)
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-NEXT: vmovlps %xmm0, (%rax)
+; AVX512-NEXT: popq %rbx
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -195,56 +197,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm1
; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i32_stride6_vf2:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: pushq %rbx
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512DQ-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512DQ-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512DQ-NEXT: vmovd %xmm2, %ebx
+; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512DQ-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-NEXT: vmovq %xmm1, (%rsi)
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-NEXT: vmovd %xmm2, %r10d
-; AVX512DQ-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512DQ-NEXT: vmovq %xmm1, (%rcx)
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512DQ-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovlps %xmm2, (%r9)
-; AVX512DQ-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rax)
+; AVX512DQ-NEXT: popq %rbx
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -254,56 +258,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i32_stride6_vf2:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbx
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512BW-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512BW-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512BW-NEXT: vmovd %xmm2, %ebx
+; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512BW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-NEXT: vmovq %xmm1, (%rsi)
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512BW-NEXT: vmovd %xmm2, %r10d
-; AVX512BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512BW-NEXT: vmovq %xmm1, (%rcx)
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512BW-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
; AVX512BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-NEXT: vmovlps %xmm2, (%r9)
-; AVX512BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%r9)
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%rax)
+; AVX512BW-NEXT: popq %rbx
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -313,56 +319,58 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm1
; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf2:
; AVX512DQ-BW: # %bb.0:
+; AVX512DQ-BW-NEXT: pushq %rbx
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-BW-NEXT: vextractps $2, %xmm1, %r10d
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
-; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %r10d
+; AVX512DQ-BW-NEXT: vextractps $3, %xmm1, %r11d
+; AVX512DQ-BW-NEXT: vmovd %xmm2, %ebx
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm1
+; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rsi)
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-BW-NEXT: vmovd %xmm2, %r10d
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT: vpinsrd $1, %ebx, %xmm3, %xmm1
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx)
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,2,0,0]
-; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm2, %ymm2
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,3,0,0]
-; AVX512DQ-BW-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%r9)
-; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-BW-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax)
+; AVX512DQ-BW-NEXT: popq %rbx
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -372,25 +380,25 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm1
; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,7,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [2,4,2,4]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
-; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [5,3,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm4 = mem[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [1,7,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,4,2,4]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm3, %xmm2, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [7,1,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [4,2,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [5,3,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <12 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index 7948141..64ddca7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -18,35 +18,35 @@
define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i32_stride7_vf2:
; SSE: # %bb.0:
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: movdqa (%rdi), %xmm0
-; SSE-NEXT: movdqa 16(%rdi), %xmm1
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: movdqa 16(%rdi), %xmm0
; SSE-NEXT: movdqa 32(%rdi), %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,2,3,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,2,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE-NEXT: movdqa 48(%rdi), %xmm3
+; SSE-NEXT: movq %xmm1, (%rsi)
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: movdqa 48(%rdi), %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
-; SSE-NEXT: movq %xmm0, (%rsi)
; SSE-NEXT: movq %xmm4, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
; SSE-NEXT: movq %xmm5, (%rcx)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; SSE-NEXT: movq %xmm6, (%r8)
-; SSE-NEXT: movq %xmm1, (%r9)
-; SSE-NEXT: movq %xmm3, (%r10)
-; SSE-NEXT: movq %xmm7, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,3,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movq %xmm0, (%r9)
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: movq %xmm2, (%rsi)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movq %xmm4, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_stride7_vf2:
@@ -60,26 +60,26 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovaps 32(%rdi), %xmm4
; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0,2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rsi)
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rdx)
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX-NEXT: vmovlps %xmm5, (%rcx)
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r8)
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r9)
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r10)
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
-; AVX-NEXT: vmovlps %xmm5, (%rsi)
-; AVX-NEXT: vmovlps %xmm6, (%rdx)
-; AVX-NEXT: vmovlps %xmm7, (%rcx)
-; AVX-NEXT: vmovlps %xmm2, (%r8)
-; AVX-NEXT: vmovlps %xmm3, (%r9)
-; AVX-NEXT: vmovlps %xmm4, (%r10)
; AVX-NEXT: vmovlps %xmm0, (%rax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -94,27 +94,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovaps (%rdi), %xmm3
; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-NEXT: vmovlps %xmm2, (%r10)
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-NEXT: vmovlps %xmm7, (%r10)
; AVX2-NEXT: vmovlps %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -129,27 +129,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovaps (%rdi), %xmm3
; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-FP-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r10)
; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-FP-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-FP-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-FP-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-FP-NEXT: vmovlps %xmm7, (%r10)
; AVX2-FP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -164,27 +164,27 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm3
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3]
-; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm6
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
-; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm5
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%rcx)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm3[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r8)
+; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm2 = [4,3,0,0]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,2,3]
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r10)
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovlps %xmm5, (%rdx)
-; AVX2-FCP-NEXT: vmovlps %xmm6, (%rcx)
-; AVX2-FCP-NEXT: vmovlps %xmm3, (%r8)
-; AVX2-FCP-NEXT: vmovlps %xmm4, (%r9)
-; AVX2-FCP-NEXT: vmovlps %xmm7, (%r10)
; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -195,31 +195,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512-NEXT: vmovd %xmm1, %r11d
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
+; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
+; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0]
+; AVX512-NEXT: vpermps (%rdi), %zmm4, %zmm4
+; AVX512-NEXT: vmovaps 32(%rdi), %ymm5
+; AVX512-NEXT: vmovaps (%rdi), %ymm6
+; AVX512-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512-NEXT: vmovq %xmm2, (%rcx)
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512-NEXT: vmovaps (%rdi), %ymm5
-; AVX512-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
-; AVX512-NEXT: vmovq %xmm4, (%rcx)
; AVX512-NEXT: vmovq %xmm0, (%r8)
-; AVX512-NEXT: vmovlps %xmm1, (%r9)
-; AVX512-NEXT: vmovlps %xmm7, (%r10)
-; AVX512-NEXT: vmovlps %xmm5, (%rax)
+; AVX512-NEXT: vmovlps %xmm4, (%r9)
+; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512-NEXT: vmovlps %xmm0, (%r10)
+; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7]
+; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovlps %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -231,24 +231,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
-; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
+; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0]
+; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3
+; AVX512-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
-; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
-; AVX512-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512-FCP-NEXT: vmovlps %xmm2, (%r10)
+; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -259,31 +259,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512DQ-NEXT: vmovd %xmm1, %r11d
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
+; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
+; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0]
+; AVX512DQ-NEXT: vpermps (%rdi), %zmm4, %zmm4
+; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm5
+; AVX512DQ-NEXT: vmovaps (%rdi), %ymm6
+; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rcx)
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512DQ-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512DQ-NEXT: vmovaps (%rdi), %ymm5
-; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512DQ-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512DQ-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-NEXT: vmovlps %xmm7, (%r10)
-; AVX512DQ-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-NEXT: vmovlps %xmm4, (%r9)
+; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512DQ-NEXT: vmovlps %xmm0, (%r10)
+; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -295,24 +295,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
-; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm2, (%r10)
+; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -323,31 +323,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512BW-NEXT: vmovd %xmm1, %r11d
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
+; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
+; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0]
+; AVX512BW-NEXT: vpermps (%rdi), %zmm4, %zmm4
+; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm5
+; AVX512BW-NEXT: vmovaps (%rdi), %ymm6
+; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512BW-NEXT: vmovq %xmm2, (%rcx)
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512BW-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512BW-NEXT: vmovaps (%rdi), %ymm5
-; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
; AVX512BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-NEXT: vmovlps %xmm1, (%r9)
-; AVX512BW-NEXT: vmovlps %xmm7, (%r10)
-; AVX512BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512BW-NEXT: vmovlps %xmm4, (%r9)
+; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512BW-NEXT: vmovlps %xmm0, (%r10)
+; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7]
+; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -359,24 +359,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
-; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
-; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512BW-FCP-NEXT: vmovlps %xmm2, (%r10)
+; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -387,31 +387,31 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512DQ-BW-NEXT: vmovd %xmm1, %r11d
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
+; AVX512DQ-BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
+; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,11,0,0]
+; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm4, %zmm4
+; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm5
+; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm6
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm1[1],xmm3[2,3]
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx)
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-BW-NEXT: vmovlps %xmm7, (%r10)
-; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r9)
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r10)
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,3,2,3,6,7,6,7]
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -423,24 +423,24 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-BW-FCP-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,4,1,4]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,2,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm4
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm2, %xmm1, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm2[1],xmm4[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [7,2,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm2, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm7, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm2, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
index 13410fb..a118b40 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll
@@ -27,22 +27,22 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa 48(%rdi), %xmm3
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
-; SSE-NEXT: movdqa %xmm1, %xmm6
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
-; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE-NEXT: movq %xmm4, (%rsi)
-; SSE-NEXT: movq %xmm5, (%rdx)
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE-NEXT: movq %xmm4, (%rdx)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: movq %xmm0, (%rcx)
-; SSE-NEXT: movq %xmm2, (%r8)
-; SSE-NEXT: movq %xmm6, (%r9)
-; SSE-NEXT: movq %xmm7, (%r11)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: movq %xmm0, (%r8)
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: movq %xmm0, (%r9)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE-NEXT: movq %xmm0, (%r11)
+; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE-NEXT: movq %xmm1, (%r10)
-; SSE-NEXT: movq %xmm3, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE-NEXT: movq %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i32_stride8_vf2:
@@ -55,26 +55,26 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX-NEXT: vmovdqa (%rdi), %xmm3
; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%rsi)
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%rdx)
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3]
-; AVX-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX-NEXT: vmovq %xmm2, (%rcx)
+; AVX-NEXT: vpextrq $1, %xmm2, (%r8)
+; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vmovlps %xmm2, (%r9)
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3]
+; AVX-NEXT: vmovlps %xmm2, (%r11)
+; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vmovlps %xmm2, (%r10)
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
-; AVX-NEXT: vmovq %xmm4, (%rsi)
-; AVX-NEXT: vmovq %xmm5, (%rdx)
-; AVX-NEXT: vmovq %xmm2, (%rcx)
-; AVX-NEXT: vpextrq $1, %xmm2, (%r8)
-; AVX-NEXT: vmovlps %xmm3, (%r9)
-; AVX-NEXT: vmovlps %xmm6, (%r11)
-; AVX-NEXT: vmovlps %xmm7, (%r10)
; AVX-NEXT: vmovlps %xmm0, (%rax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -84,30 +84,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
-; AVX2-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
-; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
-; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-NEXT: vmovq %xmm5, (%rdx)
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-NEXT: vmovq %xmm2, (%rcx)
; AVX2-NEXT: vpextrq $1, %xmm2, (%r8)
-; AVX2-NEXT: vmovlps %xmm3, (%r9)
-; AVX2-NEXT: vmovlps %xmm6, (%r11)
+; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5]
+; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vmovlps %xmm2, (%r11)
+; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovlps %xmm1, (%r10)
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovlps %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -117,30 +117,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0
-; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
-; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FP-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm5, (%rdx)
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FP-NEXT: vmovq %xmm2, (%rcx)
; AVX2-FP-NEXT: vpextrq $1, %xmm2, (%r8)
-; AVX2-FP-NEXT: vmovlps %xmm3, (%r9)
-; AVX2-FP-NEXT: vmovlps %xmm6, (%r11)
+; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5]
+; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT: vmovlps %xmm2, (%r11)
+; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vmovlps %xmm1, (%r10)
+; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -150,30 +150,30 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx)
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,1,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx)
; AVX2-FCP-NEXT: vpextrq $1, %xmm2, (%r8)
-; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9)
-; AVX2-FCP-NEXT: vmovlps %xmm6, (%r11)
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r9)
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm0[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX2-FCP-NEXT: vmovlps %xmm2, (%r11)
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -186,28 +186,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512-NEXT: vmovaps (%rdi), %ymm4
-; AVX512-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
-; AVX512-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
-; AVX512-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX512-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vmovq %xmm0, (%rcx)
; AVX512-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512-NEXT: vmovlps %xmm5, (%r9)
-; AVX512-NEXT: vmovlps %xmm6, (%r11)
-; AVX512-NEXT: vmovlps %xmm4, (%r10)
-; AVX512-NEXT: vmovlps %xmm1, (%rax)
+; AVX512-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovlps %xmm0, (%r9)
+; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovlps %xmm0, (%r11)
+; AVX512-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vmovlps %xmm1, (%r10)
+; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vmovlps %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -219,27 +219,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
-; AVX512-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
-; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3
+; AVX512-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3
+; AVX512-FCP-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512-FCP-NEXT: vmovaps (%rdi), %ymm5
; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0]
+; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512-FCP-NEXT: vmovlps %xmm5, (%r9)
-; AVX512-FCP-NEXT: vmovlps %xmm6, (%r11)
-; AVX512-FCP-NEXT: vmovlps %xmm4, (%r10)
-; AVX512-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512-FCP-NEXT: vmovlps %xmm3, (%r11)
+; AVX512-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -251,28 +251,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512DQ-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512DQ-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovlps %xmm5, (%r9)
-; AVX512DQ-NEXT: vmovlps %xmm6, (%r11)
-; AVX512DQ-NEXT: vmovlps %xmm4, (%r10)
-; AVX512DQ-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%r11)
+; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -284,27 +284,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
-; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
-; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512DQ-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3
+; AVX512DQ-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %ymm5
; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm5, (%r9)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm6, (%r11)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm4, (%r10)
-; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-FCP-NEXT: vmovlps %xmm3, (%r11)
+; AVX512DQ-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -316,28 +316,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512BW-NEXT: vmovaps (%rdi), %ymm4
-; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512BW-NEXT: vmovlps %xmm5, (%r9)
-; AVX512BW-NEXT: vmovlps %xmm6, (%r11)
-; AVX512BW-NEXT: vmovlps %xmm4, (%r10)
-; AVX512BW-NEXT: vmovlps %xmm1, (%rax)
+; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%r9)
+; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%r11)
+; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vmovlps %xmm1, (%r10)
+; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT: vmovlps %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -349,27 +349,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
-; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
-; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3
+; AVX512BW-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %ymm5
; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512BW-FCP-NEXT: vmovlps %xmm5, (%r9)
-; AVX512BW-FCP-NEXT: vmovlps %xmm6, (%r11)
-; AVX512BW-FCP-NEXT: vmovlps %xmm4, (%r10)
-; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512BW-FCP-NEXT: vmovlps %xmm3, (%r11)
+; AVX512BW-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512BW-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512BW-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -381,28 +381,28 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%r9)
-; AVX512DQ-BW-NEXT: vmovlps %xmm6, (%r11)
-; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r10)
-; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,1,1,1,5,5,5,5]
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%r11)
+; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -414,27 +414,27 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
-; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm6, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
-; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpermps (%rdi), %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %ymm5
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,5,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm6, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm4, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm5[0],ymm4[0],ymm5[1],ymm4[1],ymm5[4],ymm4[4],ymm5[5],ymm4[5]
+; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm3, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7]
+; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm1, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <16 x i32>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
index 81fe19c..b609299 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
@@ -280,9 +280,9 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movq %xmm1, (%rsi)
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movq %xmm1, (%rsi)
; SSE-NEXT: movq %xmm0, (%rdx)
; SSE-NEXT: retq
;
@@ -290,8 +290,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm1, (%rsi)
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, (%rdx)
; AVX-NEXT: retq
;
@@ -299,8 +299,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovq %xmm0, (%rdx)
; AVX2-NEXT: retq
;
@@ -308,8 +308,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FP-NEXT: retq
;
@@ -317,8 +317,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FCP-NEXT: retq
;
@@ -326,8 +326,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovq %xmm1, (%rsi)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovq %xmm0, (%rdx)
; AVX512-NEXT: retq
;
@@ -335,8 +335,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512-FCP-NEXT: retq
;
@@ -344,8 +344,8 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-NEXT: retq
;
@@ -353,41 +353,41 @@ define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) noun
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rsi)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i8_stride2_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i8_stride2_vf8:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i8_stride2_vf8:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i8_stride2_vf8:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <16 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index d1d7cb0..a238371 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -378,39 +378,39 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm0, %xmm6
-; SSE-NEXT: pand %xmm5, %xmm6
-; SSE-NEXT: pandn %xmm1, %xmm5
-; SSE-NEXT: por %xmm6, %xmm5
-; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: pand %xmm6, %xmm5
-; SSE-NEXT: pandn %xmm3, %xmm6
-; SSE-NEXT: por %xmm5, %xmm6
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
-; SSE-NEXT: packuswb %xmm5, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm6, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm6
-; SSE-NEXT: por %xmm0, %xmm6
-; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE-NEXT: movq %xmm4, (%rsi)
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pand %xmm4, %xmm5
+; SSE-NEXT: pandn %xmm1, %xmm4
+; SSE-NEXT: por %xmm5, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE-NEXT: pand %xmm5, %xmm4
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: por %xmm4, %xmm5
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[2,1,0,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; SSE-NEXT: packuswb %xmm4, %xmm4
+; SSE-NEXT: movq %xmm4, (%rdx)
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm4
+; SSE-NEXT: por %xmm0, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535]
-; SSE-NEXT: pand %xmm0, %xmm6
+; SSE-NEXT: pand %xmm0, %xmm4
; SSE-NEXT: pandn %xmm3, %xmm0
-; SSE-NEXT: por %xmm6, %xmm0
+; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movq %xmm4, (%rsi)
-; SSE-NEXT: movq %xmm5, (%rdx)
; SSE-NEXT: movq %xmm0, (%rcx)
; SSE-NEXT: retq
;
@@ -421,14 +421,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vmovq %xmm2, (%rsi)
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vmovq %xmm2, (%rdx)
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm2, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
; AVX-NEXT: vmovq %xmm0, (%rcx)
; AVX-NEXT: retq
;
@@ -439,14 +439,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
; AVX2-NEXT: vmovq %xmm0, (%rcx)
; AVX2-NEXT: retq
;
@@ -457,14 +457,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
; AVX2-FP-NEXT: retq
;
@@ -475,14 +475,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX2-FCP-NEXT: retq
;
@@ -493,14 +493,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
; AVX512-NEXT: vmovq %xmm0, (%rcx)
; AVX512-NEXT: retq
;
@@ -511,14 +511,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512-FCP-NEXT: retq
;
@@ -529,14 +529,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-NEXT: retq
;
@@ -547,14 +547,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-FCP-NEXT: retq
;
@@ -565,14 +565,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-NEXT: retq
;
@@ -583,14 +583,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-FCP-NEXT: retq
;
@@ -601,14 +601,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-NEXT: retq
;
@@ -619,14 +619,14 @@ define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <24 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index abef980..1dff9f4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -409,62 +409,62 @@ define void @load_i8_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
; SSE-LABEL: load_i8_stride4_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm3
+; SSE-NEXT: movdqa (%rdi), %xmm2
; SSE-NEXT: movdqa 16(%rdi), %xmm4
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0]
+; SSE-NEXT: movdqa %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pand %xmm0, %xmm4
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1]
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm7, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
-; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pand %xmm7, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7]
-; SSE-NEXT: packuswb %xmm5, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,3,2,3]
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: packuswb %xmm4, %xmm3
+; SSE-NEXT: packuswb %xmm3, %xmm3
+; SSE-NEXT: movq %xmm3, (%rsi)
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; SSE-NEXT: packuswb %xmm6, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; SSE-NEXT: movq %xmm4, (%rdx)
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm5, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pand %xmm5, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7]
+; SSE-NEXT: packuswb %xmm4, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3]
+; SSE-NEXT: movq %xmm4, (%rcx)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE-NEXT: packuswb %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3]
-; SSE-NEXT: movq %xmm0, (%rsi)
-; SSE-NEXT: movq %xmm6, (%rdx)
-; SSE-NEXT: movq %xmm5, (%rcx)
-; SSE-NEXT: movq %xmm1, (%r8)
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; SSE-NEXT: packuswb %xmm1, %xmm2
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3]
+; SSE-NEXT: movq %xmm0, (%r8)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i8_stride4_vf8:
@@ -475,22 +475,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: vmovq %xmm3, (%rdx)
-; AVX-NEXT: vmovq %xmm4, (%rcx)
-; AVX-NEXT: vmovq %xmm1, (%r8)
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX-NEXT: vmovq %xmm0, (%rdx)
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX-NEXT: vmovq %xmm0, (%rcx)
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX-NEXT: vmovq %xmm0, (%r8)
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i8_stride4_vf8:
@@ -501,22 +501,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-NEXT: vmovq %xmm1, (%r8)
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT: vmovq %xmm0, (%rdx)
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-NEXT: vmovq %xmm0, (%rcx)
+; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT: vmovq %xmm0, (%r8)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i8_stride4_vf8:
@@ -527,22 +527,22 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-FP-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm1, (%r8)
+; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-FP-NEXT: vmovq %xmm0, (%rdx)
+; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
+; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-FP-NEXT: vmovq %xmm0, (%r8)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i8_stride4_vf8:
@@ -553,125 +553,125 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm5
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm1, (%r8)
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm3
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%r8)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i8_stride4_vf8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-NEXT: vpsrld $8, %ymm0, %ymm1
-; AVX512-NEXT: vpsrld $16, %ymm0, %ymm2
-; AVX512-NEXT: vpsrld $24, %ymm0, %ymm3
; AVX512-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512-NEXT: vpsrld $8, %ymm0, %ymm1
; AVX512-NEXT: vpmovdb %ymm1, (%rdx)
-; AVX512-NEXT: vpmovdb %ymm2, (%rcx)
-; AVX512-NEXT: vpmovdb %ymm3, (%r8)
+; AVX512-NEXT: vpsrld $16, %ymm0, %ymm1
+; AVX512-NEXT: vpmovdb %ymm1, (%rcx)
+; AVX512-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdb %ymm0, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i8_stride4_vf8:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT: vpsrld $8, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpsrld $24, %ymm0, %ymm3
; AVX512-FCP-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512-FCP-NEXT: vpsrld $8, %ymm0, %ymm1
; AVX512-FCP-NEXT: vpmovdb %ymm1, (%rdx)
-; AVX512-FCP-NEXT: vpmovdb %ymm2, (%rcx)
-; AVX512-FCP-NEXT: vpmovdb %ymm3, (%r8)
+; AVX512-FCP-NEXT: vpsrld $16, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpmovdb %ymm1, (%rcx)
+; AVX512-FCP-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpmovdb %ymm0, (%r8)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i8_stride4_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vpsrld $8, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpsrld $24, %ymm0, %ymm3
; AVX512DQ-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512DQ-NEXT: vpsrld $8, %ymm0, %ymm1
; AVX512DQ-NEXT: vpmovdb %ymm1, (%rdx)
-; AVX512DQ-NEXT: vpmovdb %ymm2, (%rcx)
-; AVX512DQ-NEXT: vpmovdb %ymm3, (%r8)
+; AVX512DQ-NEXT: vpsrld $16, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpmovdb %ymm1, (%rcx)
+; AVX512DQ-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovdb %ymm0, (%r8)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i8_stride4_vf8:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vpsrld $8, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpsrld $24, %ymm0, %ymm3
; AVX512DQ-FCP-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vpsrld $8, %ymm0, %ymm1
; AVX512DQ-FCP-NEXT: vpmovdb %ymm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vpmovdb %ymm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vpmovdb %ymm3, (%r8)
+; AVX512DQ-FCP-NEXT: vpsrld $16, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpmovdb %ymm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovdb %ymm0, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i8_stride4_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm1
-; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm2
-; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm3
; AVX512BW-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512BW-NEXT: vpsrld $8, %ymm0, %ymm1
; AVX512BW-NEXT: vpmovdb %ymm1, (%rdx)
-; AVX512BW-NEXT: vpmovdb %ymm2, (%rcx)
-; AVX512BW-NEXT: vpmovdb %ymm3, (%r8)
+; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm1
+; AVX512BW-NEXT: vpmovdb %ymm1, (%rcx)
+; AVX512BW-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovdb %ymm0, (%r8)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i8_stride4_vf8:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1
-; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm2
-; AVX512BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm3
; AVX512BW-FCP-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1
; AVX512BW-FCP-NEXT: vpmovdb %ymm1, (%rdx)
-; AVX512BW-FCP-NEXT: vpmovdb %ymm2, (%rcx)
-; AVX512BW-FCP-NEXT: vpmovdb %ymm3, (%r8)
+; AVX512BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1
+; AVX512BW-FCP-NEXT: vpmovdb %ymm1, (%rcx)
+; AVX512BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpmovdb %ymm0, (%r8)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i8_stride4_vf8:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-BW-NEXT: vpsrld $8, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vpsrld $24, %ymm0, %ymm3
; AVX512DQ-BW-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512DQ-BW-NEXT: vpsrld $8, %ymm0, %ymm1
; AVX512DQ-BW-NEXT: vpmovdb %ymm1, (%rdx)
-; AVX512DQ-BW-NEXT: vpmovdb %ymm2, (%rcx)
-; AVX512DQ-BW-NEXT: vpmovdb %ymm3, (%r8)
+; AVX512DQ-BW-NEXT: vpsrld $16, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT: vpmovdb %ymm1, (%rcx)
+; AVX512DQ-BW-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vpmovdb %ymm0, (%r8)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i8_stride4_vf8:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpsrld $8, %ymm0, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm3, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %ymm0, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpsrld $24, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpmovdb %ymm0, (%r8)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <32 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index ac14f55..5db006e5d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -583,133 +583,133 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i8_stride5_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm4
-; SSE-NEXT: movdqa 16(%rdi), %xmm3
+; SSE-NEXT: movdqa (%rdi), %xmm3
+; SSE-NEXT: movdqa 16(%rdi), %xmm2
; SSE-NEXT: movdqa 32(%rdi), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pandn %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: movdqa %xmm1, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm4
+; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: pand %xmm1, %xmm5
-; SSE-NEXT: por %xmm2, %xmm5
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: movdqa %xmm5, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
+; SSE-NEXT: por %xmm4, %xmm5
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: movdqa %xmm5, %xmm6
+; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
; SSE-NEXT: pand %xmm7, %xmm5
-; SSE-NEXT: pandn %xmm2, %xmm7
+; SSE-NEXT: pandn %xmm6, %xmm7
; SSE-NEXT: por %xmm5, %xmm7
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,1,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,5,7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,6,5,7]
+; SSE-NEXT: packuswb %xmm6, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm5, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,1,1]
+; SSE-NEXT: movdqa %xmm5, %xmm8
+; SSE-NEXT: pandn %xmm7, %xmm8
+; SSE-NEXT: por %xmm6, %xmm8
+; SSE-NEXT: movq %xmm8, (%rsi)
+; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
+; SSE-NEXT: movdqa %xmm2, %xmm7
+; SSE-NEXT: pand %xmm6, %xmm7
+; SSE-NEXT: pandn %xmm3, %xmm6
+; SSE-NEXT: por %xmm7, %xmm6
+; SSE-NEXT: movdqa %xmm6, %xmm7
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,0,65535,65535,65535,0]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE-NEXT: pand %xmm8, %xmm6
+; SSE-NEXT: pandn %xmm7, %xmm8
+; SSE-NEXT: por %xmm6, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[0,2,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,5]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,5,7]
+; SSE-NEXT: packuswb %xmm6, %xmm6
+; SSE-NEXT: pand %xmm5, %xmm6
+; SSE-NEXT: movdqa %xmm0, %xmm7
+; SSE-NEXT: pslld $24, %xmm7
+; SSE-NEXT: pandn %xmm7, %xmm5
+; SSE-NEXT: por %xmm6, %xmm5
+; SSE-NEXT: movq %xmm5, (%rdx)
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
+; SSE-NEXT: movdqa %xmm2, %xmm6
+; SSE-NEXT: pand %xmm5, %xmm6
+; SSE-NEXT: pandn %xmm3, %xmm5
+; SSE-NEXT: por %xmm6, %xmm5
+; SSE-NEXT: movdqa %xmm5, %xmm7
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm6, %xmm8
+; SSE-NEXT: pandn %xmm7, %xmm8
+; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE-NEXT: pand %xmm6, %xmm5
+; SSE-NEXT: por %xmm8, %xmm5
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,6,5,6,7]
; SSE-NEXT: packuswb %xmm7, %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm2, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: pandn %xmm8, %xmm5
-; SSE-NEXT: por %xmm7, %xmm5
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
-; SSE-NEXT: movdqa %xmm3, %xmm8
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: pand %xmm5, %xmm7
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5]
+; SSE-NEXT: packuswb %xmm8, %xmm8
+; SSE-NEXT: movdqa %xmm5, %xmm9
+; SSE-NEXT: pandn %xmm8, %xmm9
+; SSE-NEXT: por %xmm7, %xmm9
+; SSE-NEXT: movq %xmm9, (%rcx)
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
+; SSE-NEXT: movdqa %xmm2, %xmm8
; SSE-NEXT: pand %xmm7, %xmm8
-; SSE-NEXT: pandn %xmm4, %xmm7
+; SSE-NEXT: pandn %xmm3, %xmm7
; SSE-NEXT: por %xmm8, %xmm7
; SSE-NEXT: movdqa %xmm7, %xmm8
-; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,0,65535,65535,65535,0]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE-NEXT: pand %xmm9, %xmm7
+; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
+; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm8[2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5]
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7]
+; SSE-NEXT: packuswb %xmm7, %xmm7
+; SSE-NEXT: pand %xmm5, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,6]
+; SSE-NEXT: packuswb %xmm8, %xmm8
+; SSE-NEXT: movdqa %xmm5, %xmm9
; SSE-NEXT: pandn %xmm8, %xmm9
; SSE-NEXT: por %xmm7, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,2,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,5,7]
-; SSE-NEXT: packuswb %xmm7, %xmm7
-; SSE-NEXT: pand %xmm2, %xmm7
-; SSE-NEXT: movdqa %xmm0, %xmm8
-; SSE-NEXT: pslld $24, %xmm8
-; SSE-NEXT: pandn %xmm8, %xmm2
-; SSE-NEXT: por %xmm7, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
-; SSE-NEXT: movdqa %xmm3, %xmm8
-; SSE-NEXT: pand %xmm7, %xmm8
-; SSE-NEXT: pandn %xmm4, %xmm7
-; SSE-NEXT: por %xmm8, %xmm7
-; SSE-NEXT: movdqa %xmm7, %xmm9
-; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm8, %xmm10
-; SSE-NEXT: pandn %xmm9, %xmm10
-; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: por %xmm10, %xmm7
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,7]
-; SSE-NEXT: packuswb %xmm10, %xmm10
-; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pand %xmm7, %xmm10
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,6,5]
-; SSE-NEXT: packuswb %xmm11, %xmm11
-; SSE-NEXT: movdqa %xmm7, %xmm9
-; SSE-NEXT: pandn %xmm11, %xmm9
-; SSE-NEXT: por %xmm10, %xmm9
-; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; SSE-NEXT: movdqa %xmm3, %xmm11
-; SSE-NEXT: pand %xmm10, %xmm11
-; SSE-NEXT: pandn %xmm4, %xmm10
-; SSE-NEXT: por %xmm11, %xmm10
-; SSE-NEXT: movdqa %xmm10, %xmm11
-; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
-; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,0,1,2,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,7,4,6,7]
-; SSE-NEXT: packuswb %xmm11, %xmm11
-; SSE-NEXT: pand %xmm7, %xmm11
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,6]
-; SSE-NEXT: packuswb %xmm12, %xmm12
-; SSE-NEXT: movdqa %xmm7, %xmm10
-; SSE-NEXT: pandn %xmm12, %xmm10
-; SSE-NEXT: por %xmm11, %xmm10
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pandn %xmm4, %xmm1
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pandn %xmm3, %xmm8
-; SSE-NEXT: por %xmm1, %xmm8
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,0,3,4,5,6,7]
+; SSE-NEXT: movq %xmm9, (%r8)
+; SSE-NEXT: pand %xmm1, %xmm2
+; SSE-NEXT: pandn %xmm3, %xmm1
+; SSE-NEXT: por %xmm2, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; SSE-NEXT: pand %xmm6, %xmm1
+; SSE-NEXT: pandn %xmm2, %xmm6
+; SSE-NEXT: por %xmm1, %xmm6
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,0,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: pand %xmm7, %xmm1
+; SSE-NEXT: pand %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pandn %xmm0, %xmm7
-; SSE-NEXT: por %xmm1, %xmm7
-; SSE-NEXT: movq %xmm5, (%rsi)
-; SSE-NEXT: movq %xmm2, (%rdx)
-; SSE-NEXT: movq %xmm9, (%rcx)
-; SSE-NEXT: movq %xmm10, (%r8)
-; SSE-NEXT: movq %xmm7, (%r9)
+; SSE-NEXT: pandn %xmm0, %xmm5
+; SSE-NEXT: por %xmm1, %xmm5
+; SSE-NEXT: movq %xmm5, (%r9)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i8_stride5_vf8:
@@ -722,30 +722,30 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,7,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm3, (%rsi)
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm3, (%rdx)
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm3, (%rcx)
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vmovq %xmm3, (%r8)
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,5,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vmovq %xmm3, (%rsi)
-; AVX-NEXT: vmovq %xmm4, (%rdx)
-; AVX-NEXT: vmovq %xmm5, (%rcx)
-; AVX-NEXT: vmovq %xmm6, (%r8)
; AVX-NEXT: vmovq %xmm0, (%r9)
; AVX-NEXT: retq
;
@@ -758,26 +758,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX2-NEXT: vmovq %xmm3, (%rsi)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vmovq %xmm3, (%rcx)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-NEXT: vmovq %xmm3, (%r8)
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-NEXT: vmovq %xmm5, (%rcx)
-; AVX2-NEXT: vmovq %xmm6, (%r8)
; AVX2-NEXT: vmovq %xmm0, (%r9)
; AVX2-NEXT: retq
;
@@ -790,26 +790,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-FP-NEXT: vmovq %xmm3, (%rcx)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-FP-NEXT: vmovq %xmm3, (%r8)
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm5, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm6, (%r8)
; AVX2-FP-NEXT: vmovq %xmm0, (%r9)
; AVX2-FP-NEXT: retq
;
@@ -822,26 +822,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX2-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX2-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX2-FCP-NEXT: retq
;
@@ -854,26 +854,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512-NEXT: vmovq %xmm3, (%rsi)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovq %xmm3, (%rcx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vmovq %xmm3, (%r8)
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-NEXT: vmovq %xmm4, (%rdx)
-; AVX512-NEXT: vmovq %xmm5, (%rcx)
-; AVX512-NEXT: vmovq %xmm6, (%r8)
; AVX512-NEXT: vmovq %xmm0, (%r9)
; AVX512-NEXT: retq
;
@@ -886,26 +886,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512-FCP-NEXT: retq
;
@@ -918,26 +918,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-NEXT: vmovq %xmm3, (%r8)
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-NEXT: retq
;
@@ -950,26 +950,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-FCP-NEXT: retq
;
@@ -982,26 +982,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512BW-NEXT: vmovq %xmm3, (%rsi)
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-NEXT: vmovq %xmm3, (%r8)
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm4, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm5, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm6, (%r8)
; AVX512BW-NEXT: vmovq %xmm0, (%r9)
; AVX512BW-NEXT: retq
;
@@ -1014,26 +1014,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512BW-FCP-NEXT: retq
;
@@ -1046,26 +1046,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-NEXT: vmovq %xmm3, (%r8)
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-NEXT: retq
;
@@ -1078,26 +1078,26 @@ define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,6,11],zero,zero,zero,zero,xmm4[4,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,7,12],zero,zero,zero,xmm4[0,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,8,13],zero,zero,zero,xmm4[1,6,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <40 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
index f87126a..763b8a6 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll
@@ -755,146 +755,146 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-LABEL: load_i8_stride6_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movdqa (%rdi), %xmm4
-; SSE-NEXT: movdqa 16(%rdi), %xmm3
+; SSE-NEXT: movdqa (%rdi), %xmm2
+; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa 32(%rdi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0]
-; SSE-NEXT: movdqa %xmm4, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pandn %xmm3, %xmm8
-; SSE-NEXT: por %xmm1, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3]
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16711935,16711935,16711935,16711935]
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,7,6,7]
-; SSE-NEXT: packuswb %xmm6, %xmm6
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pand %xmm1, %xmm6
-; SSE-NEXT: movdqa %xmm0, %xmm7
-; SSE-NEXT: pand %xmm5, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,6,5]
+; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0]
+; SSE-NEXT: movdqa %xmm2, %xmm3
+; SSE-NEXT: pand %xmm6, %xmm3
+; SSE-NEXT: pandn %xmm1, %xmm6
+; SSE-NEXT: por %xmm3, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,1,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [16711935,16711935,16711935,16711935]
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
+; SSE-NEXT: packuswb %xmm4, %xmm4
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: pand %xmm7, %xmm4
+; SSE-NEXT: movdqa %xmm0, %xmm5
+; SSE-NEXT: pand %xmm3, %xmm5
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5]
+; SSE-NEXT: packuswb %xmm8, %xmm8
+; SSE-NEXT: movdqa %xmm7, %xmm9
+; SSE-NEXT: pandn %xmm8, %xmm9
+; SSE-NEXT: por %xmm4, %xmm9
+; SSE-NEXT: movq %xmm9, (%rsi)
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: movdqa %xmm6, %xmm8
+; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,0,0,65535,65535]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,0,4,5,6,7]
+; SSE-NEXT: pand %xmm9, %xmm6
+; SSE-NEXT: pandn %xmm8, %xmm9
+; SSE-NEXT: por %xmm6, %xmm9
; SSE-NEXT: packuswb %xmm9, %xmm9
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pandn %xmm9, %xmm2
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: movdqa %xmm8, %xmm9
-; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,65535]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,0,4,5,6,7]
-; SSE-NEXT: pand %xmm10, %xmm8
-; SSE-NEXT: pandn %xmm9, %xmm10
-; SSE-NEXT: por %xmm8, %xmm10
+; SSE-NEXT: pand %xmm7, %xmm9
+; SSE-NEXT: movdqa %xmm0, %xmm6
+; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,3,3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; SSE-NEXT: packuswb %xmm8, %xmm8
+; SSE-NEXT: pandn %xmm8, %xmm7
+; SSE-NEXT: por %xmm9, %xmm7
+; SSE-NEXT: movq %xmm7, (%rdx)
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535]
+; SSE-NEXT: movdqa %xmm8, %xmm7
+; SSE-NEXT: pandn %xmm1, %xmm7
+; SSE-NEXT: movdqa %xmm2, %xmm9
+; SSE-NEXT: pand %xmm8, %xmm9
+; SSE-NEXT: por %xmm7, %xmm9
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm9[2,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
+; SSE-NEXT: pand %xmm3, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,5,5]
; SSE-NEXT: packuswb %xmm10, %xmm10
-; SSE-NEXT: pand %xmm1, %xmm10
-; SSE-NEXT: movdqa %xmm0, %xmm8
-; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; SSE-NEXT: packuswb %xmm9, %xmm9
-; SSE-NEXT: pandn %xmm9, %xmm1
-; SSE-NEXT: por %xmm10, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm11, %xmm9
-; SSE-NEXT: pandn %xmm3, %xmm9
-; SSE-NEXT: movdqa %xmm4, %xmm12
-; SSE-NEXT: pand %xmm11, %xmm12
-; SSE-NEXT: por %xmm9, %xmm12
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[2,1,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
-; SSE-NEXT: pand %xmm5, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5]
-; SSE-NEXT: packuswb %xmm13, %xmm13
-; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm9, %xmm13
-; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,5,6]
-; SSE-NEXT: packuswb %xmm14, %xmm14
+; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm7, %xmm10
+; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6]
+; SSE-NEXT: packuswb %xmm11, %xmm11
+; SSE-NEXT: movdqa %xmm7, %xmm12
+; SSE-NEXT: pandn %xmm11, %xmm12
+; SSE-NEXT: por %xmm10, %xmm12
+; SSE-NEXT: movq %xmm12, (%rcx)
; SSE-NEXT: movdqa %xmm9, %xmm10
-; SSE-NEXT: pandn %xmm14, %xmm10
-; SSE-NEXT: por %xmm13, %xmm10
-; SSE-NEXT: movdqa %xmm12, %xmm13
-; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,1,4,5,6,7]
-; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
-; SSE-NEXT: pand %xmm14, %xmm12
-; SSE-NEXT: pandn %xmm13, %xmm14
-; SSE-NEXT: por %xmm12, %xmm14
-; SSE-NEXT: packuswb %xmm14, %xmm14
-; SSE-NEXT: pand %xmm9, %xmm14
-; SSE-NEXT: movdqa %xmm8, %xmm12
-; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0]
-; SSE-NEXT: movaps %xmm0, %xmm13
-; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[0,2]
-; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,1,0,2]
-; SSE-NEXT: packuswb %xmm13, %xmm13
-; SSE-NEXT: movdqa %xmm9, %xmm12
-; SSE-NEXT: pandn %xmm13, %xmm12
-; SSE-NEXT: por %xmm14, %xmm12
-; SSE-NEXT: pand %xmm11, %xmm3
-; SSE-NEXT: pandn %xmm4, %xmm11
-; SSE-NEXT: por %xmm3, %xmm11
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,1,2,0]
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: pand %xmm9, %xmm4
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,0,2]
-; SSE-NEXT: packuswb %xmm5, %xmm5
-; SSE-NEXT: movdqa %xmm9, %xmm3
-; SSE-NEXT: pandn %xmm5, %xmm3
-; SSE-NEXT: por %xmm4, %xmm3
-; SSE-NEXT: movdqa %xmm11, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7]
-; SSE-NEXT: pand %xmm5, %xmm6
-; SSE-NEXT: pandn %xmm4, %xmm5
-; SSE-NEXT: por %xmm6, %xmm5
-; SSE-NEXT: packuswb %xmm5, %xmm5
-; SSE-NEXT: pand %xmm9, %xmm5
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7]
+; SSE-NEXT: movdqa {{.*#+}} xmm11 = [0,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,3,4,5,6,7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7]
+; SSE-NEXT: pand %xmm11, %xmm9
+; SSE-NEXT: pandn %xmm10, %xmm11
+; SSE-NEXT: por %xmm9, %xmm11
+; SSE-NEXT: packuswb %xmm11, %xmm11
+; SSE-NEXT: pand %xmm7, %xmm11
+; SSE-NEXT: movdqa %xmm6, %xmm9
+; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[3,0]
+; SSE-NEXT: movaps %xmm0, %xmm10
+; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm9[0,2]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,2]
+; SSE-NEXT: packuswb %xmm9, %xmm9
+; SSE-NEXT: movdqa %xmm7, %xmm10
+; SSE-NEXT: pandn %xmm9, %xmm10
+; SSE-NEXT: por %xmm11, %xmm10
+; SSE-NEXT: movq %xmm10, (%r8)
+; SSE-NEXT: pand %xmm8, %xmm1
+; SSE-NEXT: pandn %xmm2, %xmm8
+; SSE-NEXT: por %xmm1, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,0]
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: pand %xmm7, %xmm1
+; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
+; SSE-NEXT: packuswb %xmm2, %xmm2
+; SSE-NEXT: movdqa %xmm7, %xmm3
+; SSE-NEXT: pandn %xmm2, %xmm3
+; SSE-NEXT: por %xmm1, %xmm3
+; SSE-NEXT: movq %xmm3, (%r9)
+; SSE-NEXT: movdqa %xmm8, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,7,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,1,2,4,5,6,7]
+; SSE-NEXT: pand %xmm2, %xmm3
+; SSE-NEXT: pandn %xmm1, %xmm2
+; SSE-NEXT: por %xmm3, %xmm2
+; SSE-NEXT: packuswb %xmm2, %xmm2
+; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm6[0,0]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pandn %xmm0, %xmm9
-; SSE-NEXT: por %xmm5, %xmm9
-; SSE-NEXT: movq %xmm2, (%rsi)
-; SSE-NEXT: movq %xmm1, (%rdx)
-; SSE-NEXT: movq %xmm10, (%rcx)
-; SSE-NEXT: movq %xmm12, (%r8)
-; SSE-NEXT: movq %xmm3, (%r9)
-; SSE-NEXT: movq %xmm9, (%rax)
+; SSE-NEXT: pandn %xmm0, %xmm7
+; SSE-NEXT: por %xmm2, %xmm7
+; SSE-NEXT: movq %xmm7, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i8_stride6_vf8:
@@ -910,42 +910,42 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[4,10,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX-NEXT: vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm7
-; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8
-; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX-NEXT: vmovq %xmm3, (%rsi)
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vmovq %xmm3, (%rdx)
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vmovq {{.*#+}} xmm4 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX-NEXT: vmovq %xmm3, (%rcx)
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX-NEXT: vmovq %xmm3, (%r8)
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX-NEXT: vmovq %xmm3, (%r9)
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovq %xmm3, (%rsi)
-; AVX-NEXT: vmovq %xmm4, (%rdx)
-; AVX-NEXT: vmovq %xmm5, (%rcx)
-; AVX-NEXT: vmovq %xmm7, (%r8)
-; AVX-NEXT: vmovq %xmm8, (%r9)
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
;
@@ -959,30 +959,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vmovq %xmm4, (%rsi)
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%r8)
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%r9)
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-NEXT: vmovq %xmm2, (%rdx)
-; AVX2-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-NEXT: vmovq %xmm3, (%r8)
-; AVX2-NEXT: vmovq %xmm5, (%r9)
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -997,30 +997,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-FP-NEXT: vmovq %xmm4, (%rsi)
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-FP-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%r8)
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%r9)
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FP-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm3, (%r8)
-; AVX2-FP-NEXT: vmovq %xmm5, (%r9)
; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -1035,30 +1035,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi)
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r8)
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%r9)
; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -1073,30 +1073,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512-NEXT: vmovq %xmm4, (%rsi)
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512-NEXT: vmovq %xmm4, (%rcx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%r8)
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%r9)
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm4, (%rsi)
-; AVX512-NEXT: vmovq %xmm2, (%rdx)
-; AVX512-NEXT: vmovq %xmm6, (%rcx)
-; AVX512-NEXT: vmovq %xmm3, (%r8)
-; AVX512-NEXT: vmovq %xmm5, (%r9)
; AVX512-NEXT: vmovq %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1111,30 +1111,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi)
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%r8)
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%r9)
; AVX512-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -1149,30 +1149,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-NEXT: vmovq %xmm4, (%rsi)
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%r8)
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%r9)
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovq %xmm4, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm6, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm3, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm5, (%r9)
; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1187,30 +1187,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi)
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r8)
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9)
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -1225,30 +1225,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512BW-NEXT: vmovq %xmm4, (%rsi)
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%r8)
; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%r9)
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm4, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm6, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm3, (%r8)
-; AVX512BW-NEXT: vmovq %xmm5, (%r9)
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1263,30 +1263,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi)
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r8)
; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9)
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -1301,30 +1301,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi)
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r8)
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm6, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
@@ -1339,30 +1339,30 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[4,10],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[2,8,14],zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,11],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,9,15],zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r8)
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9)
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 8248126..09d0079 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -932,106 +932,100 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i8_stride7_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm3
+; SSE-NEXT: movdqa (%rdi), %xmm2
; SSE-NEXT: movdqa 16(%rdi), %xmm11
; SSE-NEXT: movdqa 32(%rdi), %xmm6
-; SSE-NEXT: movdqa 48(%rdi), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm11, %xmm2
-; SSE-NEXT: por %xmm1, %xmm2
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm5
-; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
+; SSE-NEXT: movdqa 48(%rdi), %xmm13
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pandn %xmm11, %xmm4
+; SSE-NEXT: por %xmm1, %xmm4
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: movdqa %xmm4, %xmm5
+; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: pand %xmm7, %xmm2
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE-NEXT: pand %xmm7, %xmm4
; SSE-NEXT: pandn %xmm5, %xmm7
-; SSE-NEXT: por %xmm2, %xmm7
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7]
+; SSE-NEXT: por %xmm4, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,1,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1]
+; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm4[0,3,2,1,4,5,6,7]
; SSE-NEXT: packuswb %xmm7, %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm2, %xmm7
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm4, %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm6, %xmm5
; SSE-NEXT: pand %xmm9, %xmm5
-; SSE-NEXT: pandn %xmm0, %xmm9
+; SSE-NEXT: pandn %xmm13, %xmm9
; SSE-NEXT: por %xmm5, %xmm9
-; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
; SSE-NEXT: movdqa %xmm6, %xmm8
-; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3]
+; SSE-NEXT: movss {{.*#+}} xmm8 = xmm13[0],xmm8[1,2,3]
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm11, %xmm10
-; SSE-NEXT: movdqa %xmm11, %xmm1
; SSE-NEXT: pand %xmm5, %xmm10
; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm6, %xmm4
-; SSE-NEXT: pand %xmm12, %xmm4
-; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pandn %xmm0, %xmm12
-; SSE-NEXT: movaps %xmm0, %xmm14
+; SSE-NEXT: movdqa %xmm6, %xmm1
+; SSE-NEXT: pand %xmm12, %xmm1
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pandn %xmm13, %xmm12
+; SSE-NEXT: movaps %xmm13, %xmm14
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm6[0,0]
; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm6[2,3]
-; SSE-NEXT: pand %xmm5, %xmm0
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: pand %xmm5, %xmm13
; SSE-NEXT: pandn %xmm6, %xmm5
-; SSE-NEXT: movdqa %xmm6, %xmm15
-; SSE-NEXT: pxor %xmm0, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
-; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,5,6]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
+; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,6]
; SSE-NEXT: packuswb %xmm9, %xmm9
-; SSE-NEXT: movdqa %xmm2, %xmm11
-; SSE-NEXT: movdqa %xmm2, %xmm13
-; SSE-NEXT: pandn %xmm9, %xmm13
-; SSE-NEXT: por %xmm7, %xmm13
+; SSE-NEXT: movdqa %xmm4, %xmm15
+; SSE-NEXT: pandn %xmm9, %xmm15
+; SSE-NEXT: por %xmm7, %xmm15
+; SSE-NEXT: movq %xmm15, (%rsi)
; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535]
; SSE-NEXT: movdqa %xmm7, %xmm9
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: pandn %xmm1, %xmm9
-; SSE-NEXT: movdqa %xmm3, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: por %xmm9, %xmm3
-; SSE-NEXT: movdqa %xmm3, %xmm9
-; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; SSE-NEXT: pandn %xmm11, %xmm9
+; SSE-NEXT: movdqa %xmm2, %xmm15
+; SSE-NEXT: pand %xmm7, %xmm15
+; SSE-NEXT: por %xmm9, %xmm15
+; SSE-NEXT: movdqa %xmm15, %xmm9
+; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: pandn %xmm9, %xmm1
-; SSE-NEXT: pxor %xmm6, %xmm6
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm3[8],xmm15[9],xmm3[9],xmm15[10],xmm3[10],xmm15[11],xmm3[11],xmm15[12],xmm3[12],xmm15[13],xmm3[13],xmm15[14],xmm3[14],xmm15[15],xmm3[15]
+; SSE-NEXT: pand %xmm0, %xmm15
+; SSE-NEXT: por %xmm1, %xmm15
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
; SSE-NEXT: movdqa %xmm8, %xmm9
; SSE-NEXT: pand %xmm1, %xmm9
-; SSE-NEXT: pandn %xmm15, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm1
; SSE-NEXT: por %xmm9, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm11, %xmm9
+; SSE-NEXT: movdqa %xmm4, %xmm9
; SSE-NEXT: pandn %xmm1, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: pand %xmm11, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: por %xmm1, %xmm9
+; SSE-NEXT: movq %xmm9, (%rdx)
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: pandn %xmm2, %xmm1
; SSE-NEXT: por %xmm1, %xmm10
; SSE-NEXT: movdqa %xmm10, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3],xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
; SSE-NEXT: pand %xmm0, %xmm10
; SSE-NEXT: pandn %xmm1, %xmm0
; SSE-NEXT: por %xmm10, %xmm0
@@ -1040,107 +1034,104 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pand %xmm11, %xmm0
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535]
; SSE-NEXT: pand %xmm1, %xmm8
-; SSE-NEXT: pandn %xmm15, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm1
; SSE-NEXT: por %xmm8, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm11, %xmm8
+; SSE-NEXT: movdqa %xmm4, %xmm8
; SSE-NEXT: pandn %xmm1, %xmm8
; SSE-NEXT: por %xmm0, %xmm8
+; SSE-NEXT: movq %xmm8, (%rcx)
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: movdqa %xmm11, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: pandn %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm10
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: pand %xmm3, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE-NEXT: pand %xmm8, %xmm0
+; SSE-NEXT: pandn %xmm1, %xmm8
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; SSE-NEXT: por %xmm0, %xmm3
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7]
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: por %xmm0, %xmm8
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,2,1,0,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pand %xmm11, %xmm0
+; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7]
-; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,3,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6]
-; SSE-NEXT: packuswb %xmm3, %xmm3
-; SSE-NEXT: pandn %xmm3, %xmm11
-; SSE-NEXT: por %xmm0, %xmm11
-; SSE-NEXT: movdqa %xmm11, %xmm6
+; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm12[0,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
+; SSE-NEXT: packuswb %xmm8, %xmm8
+; SSE-NEXT: pandn %xmm8, %xmm4
+; SSE-NEXT: por %xmm0, %xmm4
+; SSE-NEXT: movq %xmm4, (%r8)
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
-; SSE-NEXT: movdqa %xmm4, %xmm2
-; SSE-NEXT: movdqa %xmm4, %xmm3
-; SSE-NEXT: pand %xmm0, %xmm3
-; SSE-NEXT: movdqa %xmm10, %xmm11
-; SSE-NEXT: pandn %xmm10, %xmm0
-; SSE-NEXT: por %xmm3, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm4
-; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: movdqa %xmm11, %xmm4
+; SSE-NEXT: pand %xmm0, %xmm4
+; SSE-NEXT: pandn %xmm2, %xmm0
+; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm4
+; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5]
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535]
-; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pandn %xmm15, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: packuswb %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,3,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,3,2,3]
-; SSE-NEXT: movdqa %xmm11, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535]
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm4
+; SSE-NEXT: por %xmm1, %xmm4
+; SSE-NEXT: packuswb %xmm4, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
+; SSE-NEXT: movq %xmm0, (%r9)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,2,2,3]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3],xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7]
; SSE-NEXT: pand %xmm0, %xmm14
-; SSE-NEXT: pandn %xmm15, %xmm0
+; SSE-NEXT: pandn %xmm6, %xmm0
; SSE-NEXT: por %xmm14, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE-NEXT: packuswb %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,3,2,3]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pand %xmm7, %xmm0
-; SSE-NEXT: pandn %xmm3, %xmm7
-; SSE-NEXT: por %xmm0, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
+; SSE-NEXT: movq %xmm0, (%rcx)
+; SSE-NEXT: pand %xmm7, %xmm11
+; SSE-NEXT: pandn %xmm2, %xmm7
+; SSE-NEXT: por %xmm11, %xmm7
; SSE-NEXT: movdqa %xmm7, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
-; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE-NEXT: por %xmm13, %xmm5
+; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
; SSE-NEXT: pand %xmm1, %xmm5
-; SSE-NEXT: pandn %xmm15, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm1
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
@@ -1148,12 +1139,6 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE-NEXT: movq %xmm13, (%rsi)
-; SSE-NEXT: movq %xmm9, (%rdx)
-; SSE-NEXT: movq %xmm8, (%rcx)
-; SSE-NEXT: movq %xmm6, (%r8)
-; SSE-NEXT: movq %xmm10, (%r9)
-; SSE-NEXT: movq %xmm11, (%rdi)
; SSE-NEXT: movq %xmm0, (%rax)
; SSE-NEXT: retq
;
@@ -1174,52 +1159,52 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0]
; AVX-NEXT: # xmm7 = mem[0,0]
; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5
-; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3]
-; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8
-; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm8, %xmm5
-; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8
-; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
-; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm9
-; AVX-NEXT: vpblendvb %xmm7, %xmm8, %xmm9, %xmm8
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
-; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6
-; AVX-NEXT: vpblendvb %xmm7, %xmm9, %xmm6, %xmm6
-; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7
-; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3],xmm10[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%rsi)
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3]
+; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vmovq %xmm4, (%rdx)
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
+; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vmovq %xmm4, (%rcx)
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
+; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vmovq %xmm4, (%r8)
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX-NEXT: vmovd {{.*#+}} xmm5 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6
+; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r9)
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4
+; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r10)
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vmovq %xmm4, (%rsi)
-; AVX-NEXT: vmovq %xmm5, (%rdx)
-; AVX-NEXT: vmovq %xmm8, (%rcx)
-; AVX-NEXT: vmovq %xmm6, (%r8)
-; AVX-NEXT: vmovq %xmm7, (%r9)
-; AVX-NEXT: vmovq %xmm10, (%r10)
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
;
@@ -1235,45 +1220,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
-; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
-; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%r8)
+; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%r9)
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%r10)
+; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-NEXT: vmovq %xmm5, (%r8)
-; AVX2-NEXT: vmovq %xmm6, (%r9)
-; AVX2-NEXT: vmovq %xmm7, (%r10)
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1290,45 +1275,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
-; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
-; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
-; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
-; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
-; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
+; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%r8)
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
+; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vmovq %xmm2, (%r10)
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
+; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm5, (%r8)
-; AVX2-FP-NEXT: vmovq %xmm6, (%r9)
-; AVX2-FP-NEXT: vmovq %xmm7, (%r10)
; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -1345,45 +1330,45 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
-; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
-; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
+; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r8)
+; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r10)
+; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
+; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX2-FCP-NEXT: vmovq %xmm6, (%r9)
-; AVX2-FCP-NEXT: vmovq %xmm7, (%r10)
; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -1400,44 +1385,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
-; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0))
-; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
-; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%rcx)
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%r8)
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%r9)
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovq %xmm2, (%r10)
; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
-; AVX512-NEXT: vmovq %xmm4, (%rcx)
-; AVX512-NEXT: vmovq %xmm5, (%r8)
-; AVX512-NEXT: vmovq %xmm6, (%r9)
-; AVX512-NEXT: vmovq %xmm7, (%r10)
; AVX512-NEXT: vmovq %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1454,44 +1439,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rcx)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%r8)
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovq %xmm2, (%r10)
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%r9)
-; AVX512-FCP-NEXT: vmovq %xmm7, (%r10)
; AVX512-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -1508,44 +1493,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%rcx)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%r8)
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovq %xmm2, (%r10)
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm6, (%r9)
-; AVX512DQ-NEXT: vmovq %xmm7, (%r10)
; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1562,44 +1547,44 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm1 ^ (ymm4 & (ymm0 ^ ymm1))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm0 ^ (ymm6 & (ymm1 ^ ymm0))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm0 ^ (ymm2 & (ymm1 ^ ymm0))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r10)
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9)
-; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10)
; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -1617,48 +1602,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: movw $580, %di # imm = 0x244
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
-; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
-; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
-; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
-; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448
-; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-NEXT: movw $580, %dx # imm = 0x244
+; AVX512BW-NEXT: kmovd %edx, %k1
+; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%r8)
+; AVX512BW-NEXT: movw $4644, %cx # imm = 0x1224
+; AVX512BW-NEXT: kmovd %ecx, %k1
+; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovq %xmm2, (%r10)
+; AVX512BW-NEXT: movw $9288, %cx # imm = 0x2448
+; AVX512BW-NEXT: kmovd %ecx, %k1
; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm5, (%r8)
-; AVX512BW-NEXT: vmovq %xmm6, (%r9)
-; AVX512BW-NEXT: vmovq %xmm7, (%r10)
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1676,48 +1661,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512BW-FCP-NEXT: movw $580, %di # imm = 0x244
-; AVX512BW-FCP-NEXT: kmovd %edi, %k1
-; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224
-; AVX512BW-FCP-NEXT: kmovd %edi, %k1
-; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448
-; AVX512BW-FCP-NEXT: kmovd %edi, %k1
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: movw $580, %dx # imm = 0x244
+; AVX512BW-FCP-NEXT: kmovd %edx, %k1
+; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx)
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r8)
+; AVX512BW-FCP-NEXT: movw $4644, %cx # imm = 0x1224
+; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
+; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%r10)
+; AVX512BW-FCP-NEXT: movw $9288, %cx # imm = 0x2448
+; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9)
-; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -1735,48 +1720,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512DQ-BW-NEXT: movw $580, %di # imm = 0x244
-; AVX512DQ-BW-NEXT: kmovd %edi, %k1
-; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224
-; AVX512DQ-BW-NEXT: kmovd %edi, %k1
-; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448
-; AVX512DQ-BW-NEXT: kmovd %edi, %k1
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-NEXT: movw $580, %dx # imm = 0x244
+; AVX512DQ-BW-NEXT: kmovd %edx, %k1
+; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx)
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r8)
+; AVX512DQ-BW-NEXT: movw $4644, %cx # imm = 0x1224
+; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
+; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%r10)
+; AVX512DQ-BW-NEXT: movw $9288, %cx # imm = 0x2448
+; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9)
-; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10)
; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
@@ -1794,48 +1779,48 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512DQ-BW-FCP-NEXT: movw $580, %di # imm = 0x244
-; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224
-; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448
-; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: movw $580, %dx # imm = 0x244
+; AVX512DQ-BW-FCP-NEXT: kmovd %edx, %k1
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r8)
+; AVX512DQ-BW-FCP-NEXT: movw $4644, %cx # imm = 0x1224
+; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%r10)
+; AVX512DQ-BW-FCP-NEXT: movw $9288, %cx # imm = 0x2448
+; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 6770fb6..deb74d2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -878,212 +878,205 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
; SSE-LABEL: load_i8_stride8_vf8:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: movdqa (%rdi), %xmm12
+; SSE-NEXT: movdqa (%rdi), %xmm9
; SSE-NEXT: movdqa 16(%rdi), %xmm11
-; SSE-NEXT: movdqa 32(%rdi), %xmm9
-; SSE-NEXT: movdqa 48(%rdi), %xmm10
+; SSE-NEXT: movdqa 32(%rdi), %xmm13
+; SSE-NEXT: movdqa 48(%rdi), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0]
; SSE-NEXT: movdqa %xmm11, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm12, %xmm2
+; SSE-NEXT: movdqa %xmm9, %xmm2
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: packuswb %xmm1, %xmm2
; SSE-NEXT: packuswb %xmm2, %xmm2
-; SSE-NEXT: movdqa %xmm10, %xmm1
+; SSE-NEXT: movdqa %xmm7, %xmm1
; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pand %xmm9, %xmm0
+; SSE-NEXT: pand %xmm13, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: pxor %xmm7, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3]
+; SSE-NEXT: movq %xmm0, (%rsi)
+; SSE-NEXT: pxor %xmm5, %xmm5
; SSE-NEXT: movdqa %xmm11, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm11, %xmm14
-; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3],xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
-; SSE-NEXT: movdqa %xmm14, %xmm15
-; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; SSE-NEXT: packuswb %xmm15, %xmm15
+; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7]
+; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; SSE-NEXT: packuswb %xmm14, %xmm14
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: pandn %xmm15, %xmm0
-; SSE-NEXT: movdqa %xmm12, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm12, %xmm5
-; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
-; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm10, %xmm0
-; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15]
-; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm10, %xmm13
-; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3],xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
-; SSE-NEXT: movdqa %xmm13, %xmm4
-; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,3]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
-; SSE-NEXT: movdqa %xmm2, %xmm6
-; SSE-NEXT: pandn %xmm8, %xmm6
-; SSE-NEXT: movdqa %xmm9, %xmm8
-; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
-; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm14, %xmm2
; SSE-NEXT: movdqa %xmm9, %xmm0
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa %xmm0, %xmm7
-; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,1,3]
-; SSE-NEXT: packuswb %xmm8, %xmm8
-; SSE-NEXT: pand %xmm2, %xmm8
-; SSE-NEXT: por %xmm6, %xmm8
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm8[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
-; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: pand %xmm6, %xmm11
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm8, %xmm8
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pandn %xmm8, %xmm1
-; SSE-NEXT: pand %xmm6, %xmm12
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm8, %xmm8
-; SSE-NEXT: pand %xmm3, %xmm8
-; SSE-NEXT: por %xmm1, %xmm8
-; SSE-NEXT: pand %xmm6, %xmm10
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,5]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm0
-; SSE-NEXT: pand %xmm6, %xmm9
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,2,0]
-; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,3]
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[3,3,3,3]
-; SSE-NEXT: packuswb %xmm15, %xmm15
-; SSE-NEXT: pand %xmm3, %xmm15
-; SSE-NEXT: por %xmm1, %xmm15
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm4, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,3,3]
-; SSE-NEXT: packuswb %xmm1, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm5[2,0,2,3,4,5,6,7]
-; SSE-NEXT: packuswb %xmm11, %xmm11
-; SSE-NEXT: pand %xmm3, %xmm11
-; SSE-NEXT: por %xmm1, %xmm11
-; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,4,6]
+; SSE-NEXT: movdqa %xmm9, %xmm15
+; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
+; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm7, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm7, %xmm8
+; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; SSE-NEXT: movdqa %xmm8, %xmm6
+; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; SSE-NEXT: packuswb %xmm6, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,3]
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: pandn %xmm10, %xmm4
+; SSE-NEXT: movdqa %xmm13, %xmm1
+; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
+; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; SSE-NEXT: movdqa %xmm13, %xmm10
+; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
+; SSE-NEXT: movdqa %xmm10, %xmm5
+; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,1,1,3]
+; SSE-NEXT: packuswb %xmm12, %xmm12
+; SSE-NEXT: pand %xmm2, %xmm12
+; SSE-NEXT: por %xmm4, %xmm12
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE-NEXT: movq %xmm0, (%rdx)
+; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm0, %xmm11
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm4, %xmm4
+; SSE-NEXT: movdqa %xmm3, %xmm12
+; SSE-NEXT: pandn %xmm4, %xmm12
+; SSE-NEXT: pand %xmm0, %xmm9
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm4, %xmm4
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: por %xmm12, %xmm4
+; SSE-NEXT: pand %xmm0, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,7,5]
+; SSE-NEXT: packuswb %xmm12, %xmm12
; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,1,1,3]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,4,6,6,7]
+; SSE-NEXT: pandn %xmm12, %xmm1
+; SSE-NEXT: pand %xmm0, %xmm13
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,2,0]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; SSE-NEXT: packuswb %xmm14, %xmm14
-; SSE-NEXT: movdqa %xmm3, %xmm0
-; SSE-NEXT: pandn %xmm14, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,1,1]
-; SSE-NEXT: packuswb %xmm9, %xmm9
-; SSE-NEXT: pand %xmm3, %xmm9
-; SSE-NEXT: por %xmm0, %xmm9
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
-; SSE-NEXT: packuswb %xmm13, %xmm13
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,3]
-; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
+; SSE-NEXT: movq %xmm4, (%rcx)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3]
+; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,1,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,3,1,4,5,6,7]
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[3,1,2,3,4,5,6,7]
+; SSE-NEXT: movdqa %xmm2, %xmm1
+; SSE-NEXT: pandn %xmm6, %xmm1
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,2,3,3]
; SSE-NEXT: packuswb %xmm4, %xmm4
-; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: pand %xmm2, %xmm4
; SSE-NEXT: por %xmm1, %xmm4
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,5,5,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: movq %xmm0, (%r8)
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,2,0,4,5,6,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,5,7,6,7]
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pandn %xmm0, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,3]
+; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,0,2,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm6, %xmm6
+; SSE-NEXT: pand %xmm3, %xmm6
+; SSE-NEXT: por %xmm4, %xmm6
+; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5,4,6]
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: movdqa %xmm2, %xmm4
+; SSE-NEXT: pandn %xmm0, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,1,1,3]
+; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,6,6,7]
+; SSE-NEXT: packuswb %xmm7, %xmm7
+; SSE-NEXT: pand %xmm2, %xmm7
+; SSE-NEXT: por %xmm4, %xmm7
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; SSE-NEXT: movq %xmm6, (%r9)
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7]
+; SSE-NEXT: packuswb %xmm11, %xmm11
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pandn %xmm11, %xmm4
+; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[1,1,1,1]
+; SSE-NEXT: packuswb %xmm6, %xmm6
+; SSE-NEXT: pand %xmm3, %xmm6
+; SSE-NEXT: por %xmm4, %xmm6
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
+; SSE-NEXT: packuswb %xmm8, %xmm8
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,2,3]
+; SSE-NEXT: movdqa %xmm2, %xmm7
+; SSE-NEXT: pandn %xmm4, %xmm7
+; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
+; SSE-NEXT: packuswb %xmm4, %xmm4
+; SSE-NEXT: pand %xmm2, %xmm4
+; SSE-NEXT: por %xmm7, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,1,1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq %xmm6, (%rax)
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: movdqa %xmm3, %xmm4
+; SSE-NEXT: pandn %xmm1, %xmm4
+; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,1,2,3,4,5,6,7]
+; SSE-NEXT: packuswb %xmm1, %xmm1
+; SSE-NEXT: pand %xmm3, %xmm1
+; SSE-NEXT: por %xmm4, %xmm1
+; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,7]
+; SSE-NEXT: packuswb %xmm4, %xmm4
+; SSE-NEXT: movdqa %xmm2, %xmm5
+; SSE-NEXT: pandn %xmm4, %xmm5
+; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
+; SSE-NEXT: por %xmm5, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3]
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq %xmm1, (%rax)
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,2,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,3,3,3]
; SSE-NEXT: packuswb %xmm1, %xmm1
; SSE-NEXT: pand %xmm3, %xmm1
; SSE-NEXT: pandn %xmm0, %xmm3
; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3]
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm13, %xmm2
+; SSE-NEXT: pandn %xmm8, %xmm2
; SSE-NEXT: por %xmm0, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; SSE-NEXT: # xmm0 = mem[0,3,2,3]
-; SSE-NEXT: movq %xmm0, (%rsi)
-; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; SSE-NEXT: movlps %xmm0, (%rdx)
-; SSE-NEXT: movq %xmm8, (%rcx)
-; SSE-NEXT: movq %xmm15, (%r8)
-; SSE-NEXT: movq %xmm11, (%r9)
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq %xmm9, (%rax)
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq %xmm4, (%rax)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq %xmm3, (%rax)
-; SSE-NEXT: popq %rax
; SSE-NEXT: retq
;
; AVX-LABEL: load_i8_stride8_vf8:
@@ -1104,76 +1097,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
-; AVX-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX-NEXT: vmovq %xmm4, (%rsi)
+; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
-; AVX-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm6
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
-; AVX-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm8
-; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm7
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm9
-; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm8
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
-; AVX-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm9
-; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm8
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm10
-; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm9
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
-; AVX-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm9
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm11
-; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm10
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
-; AVX-NEXT: vmovd {{.*#+}} xmm10 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm11
-; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm10
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm12
-; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm11
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
-; AVX-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm3
-; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%rdx)
+; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%rcx)
+; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r8)
+; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r9)
+; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r11)
+; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovq %xmm4, (%r10)
+; AVX-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vmovq %xmm4, (%rsi)
-; AVX-NEXT: vmovq %xmm5, (%rdx)
-; AVX-NEXT: vmovq %xmm6, (%rcx)
-; AVX-NEXT: vmovq %xmm7, (%r8)
-; AVX-NEXT: vmovq %xmm8, (%r9)
-; AVX-NEXT: vmovq %xmm9, (%r11)
-; AVX-NEXT: vmovq %xmm10, (%r10)
; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
;
@@ -1195,76 +1188,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovq %xmm4, (%rsi)
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm6
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8
-; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm9
-; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm8
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm9
-; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm8
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm10
-; AVX2-NEXT: vpshufb %xmm9, %xmm0, %xmm9
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-NEXT: vpshufb %xmm10, %xmm1, %xmm11
-; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm10
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11
-; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm10
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-NEXT: vpshufb %xmm11, %xmm1, %xmm12
-; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm11
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-NEXT: vpshufb %xmm11, %xmm3, %xmm3
-; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovq %xmm4, (%r8)
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovq %xmm4, (%r9)
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovq %xmm4, (%r11)
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-NEXT: vmovq %xmm4, (%r10)
+; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-NEXT: vmovq %xmm5, (%rdx)
-; AVX2-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-NEXT: vmovq %xmm7, (%r8)
-; AVX2-NEXT: vmovq %xmm8, (%r9)
-; AVX2-NEXT: vmovq %xmm9, (%r11)
-; AVX2-NEXT: vmovq %xmm10, (%r10)
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
;
@@ -1286,76 +1279,76 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovq %xmm4, (%rsi)
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm7 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm8
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm7
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm9
-; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm8
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm9
-; AVX2-FP-NEXT: vpshufb %xmm8, %xmm2, %xmm8
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm10
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm9
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm9 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm9
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm1, %xmm11
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm10
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm10
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX2-FP-NEXT: vpshufb %xmm11, %xmm1, %xmm12
-; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm11
-; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm11 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovq %xmm4, (%r8)
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovq %xmm4, (%r9)
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovq %xmm4, (%r11)
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm5 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vmovq %xmm4, (%r10)
+; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-FP-NEXT: vmovq %xmm4, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm5, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm6, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm7, (%r8)
-; AVX2-FP-NEXT: vmovq %xmm8, (%r9)
-; AVX2-FP-NEXT: vmovq %xmm9, (%r11)
-; AVX2-FP-NEXT: vmovq %xmm10, (%r10)
; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FP-NEXT: retq
;
@@ -1364,54 +1357,54 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3
; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; AVX2-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm6
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm7 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm0
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm6 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm7
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi)
; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm8
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm9 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm10
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm11
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3]
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm14 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,5,7,5,7,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
-; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm6
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
-; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
-; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm8, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm11, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm3, (%r8)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%r9)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%r11)
-; AVX2-FCP-NEXT: vmovq %xmm6, (%r10)
-; AVX2-FCP-NEXT: vmovq %xmm1, (%rax)
+; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm7
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm8 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx)
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm9
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm10 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm11
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm9, (%rcx)
+; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm11 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r8)
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm3
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm2
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r11)
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r10)
+; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
@@ -1421,21 +1414,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512-NEXT: vpmovqb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -1445,21 +1438,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpmovqb %zmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -1469,21 +1462,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512DQ-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512DQ-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovqb %zmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -1493,21 +1486,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512DQ-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -1517,21 +1510,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512BW-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512BW-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512BW-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512BW-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512BW-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512BW-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512BW-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512BW-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512BW-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512BW-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512BW-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512BW-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512BW-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovqb %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -1541,21 +1534,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512BW-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512BW-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovqb %zmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -1565,21 +1558,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512DQ-BW-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-BW-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vpmovqb %zmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -1589,21 +1582,21 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm5, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm6, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm1, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <64 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll
index d0bb90c..552b927 100644
--- a/llvm/test/CodeGen/X86/vector-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-mul.ll
@@ -250,7 +250,7 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,1,2,4,8]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,u,2,u,4,u,8,u,1,u,2,u,4,u,8,u]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
@@ -265,7 +265,7 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: por %xmm1, %xmm0
; X86-SSE4-NEXT: retl
@@ -275,7 +275,7 @@ define <16 x i8> @mul_v16i8_1_2_4_8_1_2_4_8_1_2_4_8_1_2_4_8(<16 x i8> %a0) nounw
; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,2,0,8,0,2,0,8,0,2,0,8]
; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,1,0,4,0,1,0,4,0,1,0,4,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: por %xmm1, %xmm0
; X64-SSE4-NEXT: retq
@@ -1058,11 +1058,11 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [3,9,17,33,65,129,2,3]
+; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [3,u,9,u,17,u,33,u,65,u,129,u,2,u,3,u]
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X86-SSE2-NEXT: pand %xmm2, %xmm1
; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,3,9,17,33,65,129,2]
+; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,u,3,u,9,u,17,u,33,u,65,u,129,u,2,u]
; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: packuswb %xmm1, %xmm0
; X86-SSE2-NEXT: retl
@@ -1072,7 +1072,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: por %xmm1, %xmm0
; X86-SSE4-NEXT: retl
@@ -1081,11 +1081,11 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqa %xmm0, %xmm1
; X64-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3,9,17,33,65,129,2,3]
+; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [3,u,9,u,17,u,33,u,65,u,129,u,2,u,3,u]
; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X64-SSE2-NEXT: pand %xmm2, %xmm1
; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,3,9,17,33,65,129,2]
+; X64-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,u,3,u,9,u,17,u,33,u,65,u,129,u,2,u]
; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: packuswb %xmm1, %xmm0
; X64-SSE2-NEXT: retq
@@ -1095,7 +1095,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: por %xmm1, %xmm0
; X64-SSE4-NEXT: retq
@@ -1103,7 +1103,7 @@ define <16 x i8> @mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3(<16 x i8>
; X64-XOP-LABEL: mul_v16i8_2_3_9_17_33_65_129_2_3_9_17_33_65_129_2_3:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,3,0,17,0,65,0,2,0,9,0,33,0,129,0,3]
-; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,0,9,0,33,0,129,0,3,0,17,0,65,0,2,0]
+; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,3,9,17,33,65,129,2,3,9,17,33,65,129,2,3]
; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14]
; X64-XOP-NEXT: retq
;
@@ -1832,7 +1832,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,1,3,7,15,31,63,127]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,u,1,u,3,u,7,u,15,u,31,u,63,u,127,u]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
@@ -1847,7 +1847,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
; X86-SSE4-NEXT: movdqa %xmm0, %xmm1
; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
; X86-SSE4-NEXT: psllw $8, %xmm1
-; X86-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X86-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
; X86-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE4-NEXT: por %xmm1, %xmm0
; X86-SSE4-NEXT: retl
@@ -1857,7 +1857,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
; X64-SSE4-NEXT: movdqa %xmm0, %xmm1
; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
; X64-SSE4-NEXT: psllw $8, %xmm1
-; X64-SSE4-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X64-SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
; X64-SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-SSE4-NEXT: por %xmm1, %xmm0
; X64-SSE4-NEXT: retq
@@ -1865,7 +1865,7 @@ define <16 x i8> @mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127(<16 x i8>
; X64-XOP-LABEL: mul_v16i8_0_1_3_7_15_31_63_127_0_1_3_7_15_31_63_127:
; X64-XOP: # %bb.0:
; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,1,0,7,0,31,0,127,0,1,0,7,0,31,0,127]
-; X64-XOP-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,3,0,15,0,63,0,0,0,3,0,15,0,63,0]
+; X64-XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,1,3,7,15,31,63,127,0,1,3,7,15,31,63,127]
; X64-XOP-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14]
; X64-XOP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
index b233855..324fe12 100644
--- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
+++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -85,14 +85,14 @@ define <4 x i16> @smulfixsat(<4 x i16> %a) {
; CHECK-NEXT: movswl %dx, %edx
; CHECK-NEXT: leal (,%rdx,4), %esi
; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: shrl $16, %edi
-; CHECK-NEXT: shldw $1, %si, %di
+; CHECK-NEXT: shrl $16, %esi
+; CHECK-NEXT: shldw $1, %di, %si
; CHECK-NEXT: sarl $14, %edx
; CHECK-NEXT: cmpl $16384, %edx # imm = 0x4000
-; CHECK-NEXT: cmovgel %eax, %edi
+; CHECK-NEXT: cmovgel %eax, %esi
; CHECK-NEXT: cmpl $-16384, %edx # imm = 0xC000
-; CHECK-NEXT: cmovll %ecx, %edi
-; CHECK-NEXT: pinsrw $3, %edi, %xmm1
+; CHECK-NEXT: cmovll %ecx, %esi
+; CHECK-NEXT: pinsrw $3, %esi, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
; CHECK-NEXT: retq
%t = call <4 x i16> @llvm.smul.fix.sat.v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> %a, i32 15)
@@ -106,19 +106,19 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) {
; CHECK-NEXT: pextrw $2, %xmm0, %eax
; CHECK-NEXT: leal (%rax,%rax,2), %eax
; CHECK-NEXT: movl %eax, %edx
-; CHECK-NEXT: shrl $16, %edx
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: shldw $1, %ax, %cx
-; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
+; CHECK-NEXT: shrl $16, %eax
+; CHECK-NEXT: movl %eax, %ecx
+; CHECK-NEXT: shldw $1, %dx, %cx
+; CHECK-NEXT: cmpl $32768, %eax # imm = 0x8000
; CHECK-NEXT: movl $65535, %eax # imm = 0xFFFF
; CHECK-NEXT: cmovael %eax, %ecx
; CHECK-NEXT: pextrw $1, %xmm0, %edx
; CHECK-NEXT: addl %edx, %edx
; CHECK-NEXT: movl %edx, %esi
-; CHECK-NEXT: shrl $16, %esi
-; CHECK-NEXT: movl %esi, %edi
-; CHECK-NEXT: shldw $1, %dx, %di
-; CHECK-NEXT: cmpl $32768, %esi # imm = 0x8000
+; CHECK-NEXT: shrl $16, %edx
+; CHECK-NEXT: movl %edx, %edi
+; CHECK-NEXT: shldw $1, %si, %di
+; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
; CHECK-NEXT: cmovael %eax, %edi
; CHECK-NEXT: movd %xmm0, %edx
; CHECK-NEXT: xorl %esi, %esi
@@ -133,10 +133,10 @@ define <4 x i16> @umulfixsat(<4 x i16> %a) {
; CHECK-NEXT: pextrw $3, %xmm0, %ecx
; CHECK-NEXT: shll $2, %ecx
; CHECK-NEXT: movl %ecx, %edx
-; CHECK-NEXT: shrl $16, %edx
-; CHECK-NEXT: movl %edx, %esi
-; CHECK-NEXT: shldw $1, %cx, %si
-; CHECK-NEXT: cmpl $32768, %edx # imm = 0x8000
+; CHECK-NEXT: shrl $16, %ecx
+; CHECK-NEXT: movl %ecx, %esi
+; CHECK-NEXT: shldw $1, %dx, %si
+; CHECK-NEXT: cmpl $32768, %ecx # imm = 0x8000
; CHECK-NEXT: cmovael %eax, %esi
; CHECK-NEXT: pinsrw $3, %esi, %xmm1
; CHECK-NEXT: movdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index 9816fa7..044327d 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -875,28 +875,12 @@ define i1 @mask_v8i32(<8 x i32> %a0) {
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
-; AVX1-LABEL: mask_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
-; AVX1-NEXT: sete %al
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: mask_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX2-NEXT: vptest %ymm1, %ymm0
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: mask_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX512-NEXT: vptest %ymm1, %ymm0
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: mask_v8i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vtestps %ymm0, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
%2 = and i32 %1, 2147483648
%3 = icmp eq i32 %2, 0
@@ -965,28 +949,12 @@ define i1 @signtest_v8i32(<8 x i32> %a0) {
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
-; AVX1-LABEL: signtest_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
-; AVX1-NEXT: sete %al
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: signtest_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX2-NEXT: vptest %ymm1, %ymm0
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: signtest_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456]
-; AVX512-NEXT: vptest %ymm1, %ymm0
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: signtest_v8i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vtestps %ymm0, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0)
%2 = icmp sgt i32 %1, -1
ret i1 %2
@@ -1010,28 +978,12 @@ define i1 @signtest_v4i64(<4 x i64> %a0) {
; SSE41-NEXT: sete %al
; SSE41-NEXT: retq
;
-; AVX1-LABEL: signtest_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0
-; AVX1-NEXT: sete %al
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: signtest_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vptest %ymm1, %ymm0
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: signtest_v4i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512-NEXT: vptest %ymm1, %ymm0
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: signtest_v4i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vtestpd %ymm0, %ymm0
+; AVX-NEXT: sete %al
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0)
%2 = icmp sgt i64 %1, -1
ret i1 %2
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 320dce8..6cb4323 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -397,8 +397,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -409,8 +409,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512BW-NEXT: vpmovw2m %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -421,8 +421,8 @@ define i1 @trunc_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512VL-NEXT: vpmovw2m %ymm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -722,8 +722,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -734,8 +734,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -746,8 +746,8 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -974,13 +974,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind {
; AVX512BW-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -990,13 +990,13 @@ define i1 @trunc_v64i8_v64i1(<64 x i8>) nounwind {
; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovb2m %zmm0, %k0
; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -1211,8 +1211,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind {
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1222,8 +1222,8 @@ define i1 @icmp0_v16i8_v16i1(<16 x i8>) nounwind {
; AVX512VL-NEXT: vptestnmb %xmm0, %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: retq
%a = icmp eq <16 x i8> %0, zeroinitializer
@@ -1427,8 +1427,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1439,8 +1439,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512BW-NEXT: vptestnmw %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1450,8 +1450,8 @@ define i1 @icmp0_v16i16_v16i1(<16 x i16>) nounwind {
; AVX512VL-NEXT: vptestnmw %ymm0, %ymm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -1756,8 +1756,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1767,8 +1767,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1778,8 +1778,8 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -2010,13 +2010,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -2025,13 +2025,13 @@ define i1 @icmp0_v64i8_v64i1(<64 x i8>) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -2240,8 +2240,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind {
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -2251,8 +2251,8 @@ define i1 @icmp_v16i8_v16i1(<16 x i8>, <16 x i8>) nounwind {
; AVX512VL-NEXT: vpcmpeqb %xmm1, %xmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: retq
%a = icmp eq <16 x i8> %0, %1
@@ -2504,8 +2504,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -2517,8 +2517,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -2528,8 +2528,8 @@ define i1 @icmp_v16i16_v16i1(<16 x i16>, <16 x i16>) nounwind {
; AVX512VL-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -2845,8 +2845,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512F-NEXT: kmovw %k0, %eax
; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: shrl $8, %ecx
-; AVX512F-NEXT: xorb %al, %cl
+; AVX512F-NEXT: shrl $8, %eax
+; AVX512F-NEXT: xorb %cl, %al
; AVX512F-NEXT: setnp %al
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -2856,8 +2856,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: movl %eax, %ecx
-; AVX512BW-NEXT: shrl $8, %ecx
-; AVX512BW-NEXT: xorb %al, %cl
+; AVX512BW-NEXT: shrl $8, %eax
+; AVX512BW-NEXT: xorb %cl, %al
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -2867,8 +2867,8 @@ define i1 @icmp_v16i32_v16i1(<16 x i32>, <16 x i32>) nounwind {
; AVX512VL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $8, %ecx
-; AVX512VL-NEXT: xorb %al, %cl
+; AVX512VL-NEXT: shrl $8, %eax
+; AVX512VL-NEXT: xorb %cl, %al
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -3097,13 +3097,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
-; AVX512BW-NEXT: movq %rax, %rcx
-; AVX512BW-NEXT: shrq $32, %rcx
-; AVX512BW-NEXT: xorl %eax, %ecx
-; AVX512BW-NEXT: movl %ecx, %eax
-; AVX512BW-NEXT: shrl $16, %eax
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrq $32, %rax
; AVX512BW-NEXT: xorl %ecx, %eax
-; AVX512BW-NEXT: xorb %ah, %al
+; AVX512BW-NEXT: movl %eax, %ecx
+; AVX512BW-NEXT: shrl $16, %ecx
+; AVX512BW-NEXT: xorl %eax, %ecx
+; AVX512BW-NEXT: xorb %ch, %cl
; AVX512BW-NEXT: setnp %al
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -3112,13 +3112,13 @@ define i1 @icmp_v64i8_v64i1(<64 x i8>, <64 x i8>) nounwind {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512VL-NEXT: kmovq %k0, %rax
-; AVX512VL-NEXT: movq %rax, %rcx
-; AVX512VL-NEXT: shrq $32, %rcx
-; AVX512VL-NEXT: xorl %eax, %ecx
-; AVX512VL-NEXT: movl %ecx, %eax
-; AVX512VL-NEXT: shrl $16, %eax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrq $32, %rax
; AVX512VL-NEXT: xorl %ecx, %eax
-; AVX512VL-NEXT: xorb %ah, %al
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: shrl $16, %ecx
+; AVX512VL-NEXT: xorl %eax, %ecx
+; AVX512VL-NEXT: xorb %ch, %cl
; AVX512VL-NEXT: setnp %al
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index a768baa..466fa6b 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -5890,17 +5890,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512DQ-SLOW: # %bb.0:
; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0
; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
-; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-SLOW-NEXT: movw $255, %ax
-; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
-; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
-; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
-; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1
+; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1]
+; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT: kxnorb %k0, %k0, %k2
+; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2}
+; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-SLOW-NEXT: vzeroupper
@@ -5910,17 +5909,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512DQ-FAST: # %bb.0:
; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0
; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
-; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
-; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-FAST-NEXT: movw $255, %ax
-; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
-; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
-; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
-; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
+; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
+; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
+; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: kxnorb %k0, %k0, %k2
+; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2}
+; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 64(%rdx)
; AVX512DQ-FAST-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 0fb0420..aff2228 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index c5d3297..7c1a531 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1931,31 +1931,28 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE-LABEL: constant_shift_v8i8:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: psraw $8, %xmm0
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1977,7 +1974,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2003,14 +2001,12 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
;
; X86-SSE-LABEL: constant_shift_v8i8:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
ret <8 x i8> %shift
@@ -2019,31 +2015,28 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE-LABEL: constant_shift_v4i8:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,u,u,u,u]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -2065,7 +2058,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v4i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2091,14 +2084,12 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
;
; X86-SSE-LABEL: constant_shift_v4i8:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,u,u,u,u]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
ret <4 x i8> %shift
@@ -2107,31 +2098,28 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE-LABEL: constant_shift_v2i8:
; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm1
+; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,u,u,u,u,u,u]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -2153,7 +2141,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v2i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -2179,14 +2167,12 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
;
; X86-SSE-LABEL: constant_shift_v2i8:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,u,u,u,u,u,u]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: pxor %xmm1, %xmm1
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = ashr <2 x i8> %a, <i8 2, i8 3>
ret <2 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 103d570..4450d07 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
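
The regenerated AVX512BW checks above lower a variable per-byte logical shift through 16-bit variable shifts: the even bytes are isolated with a 0x00FF word mask and shifted with vpsrlvw, the odd bytes reuse the original words with the shift amounts moved down by 8, and the two results are interleaved under a 0x5555... byte mask. A rough C intrinsics sketch of that idea follows (an illustration only, not the compiler's code; the helper name lshr_v64i8 is made up, the intrinsics are real and assume AVX-512BW):

#include <immintrin.h>

/* Per-byte variable logical right shift of 64 x u8, emulated with 16-bit
   variable shifts, mirroring the vpsrlvw / vmovdqu8 sequence above. */
static inline __m512i lshr_v64i8(__m512i a, __m512i b) {
  const __m512i lo8 = _mm512_set1_epi16(0x00FF);
  /* Even (low) bytes: clear the high byte of every word in both the data
     and the shift amounts, then do a per-word variable shift. */
  __m512i even = _mm512_srlv_epi16(_mm512_and_si512(a, lo8),
                                   _mm512_and_si512(b, lo8));
  /* Odd (high) bytes: bring the odd shift amounts into the low byte of each
     word; a right shift cannot pull bits up from the low byte, so the high
     byte of each result word is already correct. */
  __m512i odd = _mm512_srlv_epi16(a, _mm512_srli_epi16(b, 8));
  /* Take even byte lanes from 'even', odd byte lanes from 'odd'. */
  __mmask64 even_bytes = 0x5555555555555555ULL;
  return _mm512_mask_mov_epi8(odd, even_bytes, even);
}
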
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
index eb39b6a..e6eb4d7 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1617,39 +1617,34 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v8i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v8i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1671,7 +1666,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1698,12 +1694,10 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v8i8:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
ret <8 x i8> %shift
@@ -1713,39 +1707,34 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v4i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,256,256,256,256]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,u,u,u,u]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,u,u,u,u]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1767,7 +1756,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v4i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1794,12 +1783,10 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v4i8:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,u,u,u,u]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
ret <4 x i8> %shift
@@ -1809,39 +1796,34 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: constant_shift_v2i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,256,256,256,256,256,256]
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,u,u,u,u,u,u]
+; SSE41-NEXT: psrlw $8, %xmm0
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,u,u,u,u,u,u]
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1863,7 +1845,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v2i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1890,12 +1872,10 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v2i8:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,u,u,u,u,u,u]
; X86-SSE-NEXT: psrlw $8, %xmm0
-; X86-SSE-NEXT: packuswb %xmm2, %xmm0
+; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
%shift = lshr <2 x i8> %a, <i8 2, i8 3>
ret <2 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
index 3085c32..efe80b4 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -1151,11 +1151,11 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,64,32,16,8,4,2,1]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,u,64,u,32,u,16,u,8,u,4,u,2,u,1,u]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
@@ -1165,7 +1165,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1174,7 +1174,7 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX1: # %bb.0:
; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -1232,11 +1232,11 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movdqa %xmm0, %xmm1
; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [128,64,32,16,8,4,2,1]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [128,u,64,u,32,u,16,u,8,u,4,u,2,u,1,u]
; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X86-SSE-NEXT: pand %xmm2, %xmm1
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u]
; X86-SSE-NEXT: pand %xmm2, %xmm0
; X86-SSE-NEXT: packuswb %xmm1, %xmm0
; X86-SSE-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
index f9ccd1e..c7d2532 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll
@@ -1313,9 +1313,9 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
-; AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1325,7 +1325,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX2: # %bb.0:
; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -1352,7 +1352,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512DQ-NEXT: vpsllw $8, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: retq
@@ -1366,7 +1366,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
;
; AVX512DQVL-LABEL: constant_shift_v32i8:
; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512DQVL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX512DQVL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512DQVL-NEXT: vpsllw $8, %ymm0, %ymm0
; AVX512DQVL-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 | (ymm1 & m32bcst)
@@ -1388,9 +1388,9 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; X86-AVX1-NEXT: vpmaddubsw %xmm1, %xmm3, %xmm1
; X86-AVX1-NEXT: vpsllw $8, %xmm1, %xmm1
; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; X86-AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = [1,4,16,64,128,32,8,2]
-; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm3, %xmm3
-; X86-AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
+; X86-AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm3
+; X86-AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
@@ -1400,7 +1400,7 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
; X86-AVX2: # %bb.0:
; X86-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; X86-AVX2-NEXT: vpsllw $8, %ymm1, %ymm1
-; X86-AVX2-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index efd7429..1e5f1b8 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -312,10 +307,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpmaddubsw %ymm2, %ymm1, %ymm3
-; AVX512DQ-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm1, %ymm3
+; AVX512DQ-NEXT: vpmullw %ymm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
@@ -329,7 +324,7 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
+; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1,1,2,4,8,16,32,64,128,128,64,32,16,8,4,2,1]
; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & m32bcst)
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
index d245bdc..07e6c36 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -1429,7 +1429,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm0
@@ -1438,7 +1438,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; SSE41-LABEL: constant_shift_v8i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm0
@@ -1447,7 +1447,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1478,7 +1478,8 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v8i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1505,7 +1506,7 @@ define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v8i8:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,u,2,u,4,u,8,u,16,u,32,u,64,u,128,u]
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: packuswb %xmm1, %xmm0
@@ -1518,7 +1519,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,u,u,u,u]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u,4,u,8,u,u,u,u,u,u,u,u,u]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm0
@@ -1527,7 +1528,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; SSE41-LABEL: constant_shift_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,u,u,u,u]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,u,2,u,4,u,8,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm0
@@ -1536,7 +1537,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,u,u,u,u]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,u,2,u,4,u,8,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1567,7 +1568,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v4i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1594,7 +1595,7 @@ define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v4i8:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,u,u,u,u]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,u,2,u,4,u,8,u,u,u,u,u,u,u,u,u]
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: packuswb %xmm1, %xmm0
@@ -1607,7 +1608,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,u,u,u,u,u,u]
+; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,u,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm0
@@ -1616,7 +1617,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; SSE41-LABEL: constant_shift_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,8,u,u,u,u,u,u]
+; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [4,u,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm0
@@ -1625,7 +1626,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; AVX1-LABEL: constant_shift_v2i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,8,u,u,u,u,u,u]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4,u,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
@@ -1656,7 +1657,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
;
; AVX512BW-LABEL: constant_shift_v2i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
@@ -1683,7 +1684,7 @@ define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; X86-SSE-LABEL: constant_shift_v2i8:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4,8,u,u,u,u,u,u]
+; X86-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [4,u,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT: pxor %xmm1, %xmm1
; X86-SSE-NEXT: packuswb %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
index c33776d..b79d9e8c 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -22,6 +22,9 @@ declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x
declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32>, <16 x i32>, <16 x i1>)
+declare <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32>, <16 x i32>, <16 x i1>)
+
define <8 x double> @combine_permvar_8f64_identity(<8 x double> %x0, <8 x double> %x1) {
; CHECK-LABEL: combine_permvar_8f64_identity:
; CHECK: # %bb.0:
@@ -1031,3 +1034,24 @@ define <8 x double> @concat_vpermilvar_v8f64_v4f64(<4 x double> %a0, <4 x double
%res = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %res
}
+
+; shift elements up by one
+define <16 x i32> @combine_vexpandd_as_valignd(<16 x i32> %x) {
+; CHECK-LABEL: combine_vexpandd_as_valignd:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: valignd {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %x, <16 x i32> zeroinitializer, <16 x i1> <i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+ ret <16 x i32> %res
+}
+
+; zero upper half of vector
+define <16 x i32> @combine_vcompressd_as_vmov(<16 x i32> %x) {
+; CHECK-LABEL: combine_vcompressd_as_vmov:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovaps %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %res = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %x, <16 x i32> zeroinitializer, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>)
+ ret <16 x i32> %res
+}
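
The two new tests above check shuffle folds that can also be written with plain AVX-512F intrinsics; a small hedged sketch of the equivalences being tested (the wrapper names are hypothetical, the intrinsics and mask values follow directly from the IR masks shown above):

#include <immintrin.h>

/* Expand with only lane 0 masked off (mask 0xFFFE) inserts a zero at the
   bottom and moves every dword up by one -- the same result as valignd
   against an all-zero vector. */
static inline __m512i shift_in_zero(__m512i x) {
  return _mm512_maskz_expand_epi32(0xFFFE, x);
  /* equivalently: _mm512_alignr_epi32(x, _mm512_setzero_si512(), 15) */
}

/* Compress with the low 8 mask bits set (mask 0x00FF) keeps the low 8
   dwords in place and zeroes the rest, i.e. it just clears the upper
   256 bits of the vector. */
static inline __m512i keep_low_half(__m512i x) {
  return _mm512_maskz_compress_epi32(0x00FF, x);
}
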
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index 3590c4d..ac58306 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -100,16 +100,14 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
; SSE-NEXT: pshufb %xmm3, %xmm4
; SSE-NEXT: pshufb %xmm8, %xmm1
; SSE-NEXT: por %xmm4, %xmm1
-; SSE-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: pand %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm0, %xmm4
-; SSE-NEXT: pmaddubsw %xmm3, %xmm4
-; SSE-NEXT: pand %xmm2, %xmm4
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: pmaddubsw %xmm2, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pmullw %xmm1, %xmm2
+; SSE-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT: pand %xmm3, %xmm2
+; SSE-NEXT: pandn %xmm1, %xmm3
+; SSE-NEXT: pmaddubsw %xmm3, %xmm0
; SSE-NEXT: psllw $8, %xmm0
-; SSE-NEXT: por %xmm4, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: PR50049:
@@ -129,21 +127,20 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
+; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm1
-; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm2
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpmaddubsw %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpandn %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmaddubsw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsllw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR50049:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index ee9d8a5..35e1c5a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3575,21 +3575,17 @@ define void @SpinningCube() {
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSE2-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE2-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u]
-; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
-; SSE2-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0]
-; SSE2-NEXT: addps %xmm0, %xmm3
-; SSE2-NEXT: movaps %xmm3, (%rax)
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: addps %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm0, (%rax)
+; SSE2-NEXT: movd {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSE2-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT: movaps %xmm2, (%rax)
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: addps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: SpinningCube:
@@ -3598,54 +3594,43 @@ define void @SpinningCube() {
; SSSE3-NEXT: xorps %xmm0, %xmm0
; SSSE3-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
-; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0]
-; SSSE3-NEXT: movss {{.*#+}} xmm3 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSSE3-NEXT: movapd {{.*#+}} xmm4 = [u,u,-2.0E+0,u]
-; SSSE3-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0]
-; SSSE3-NEXT: movq {{.*#+}} xmm3 = xmm3[0],zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,0]
-; SSSE3-NEXT: addps %xmm0, %xmm3
-; SSSE3-NEXT: movaps %xmm3, (%rax)
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,2]
-; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSSE3-NEXT: addps %xmm2, %xmm0
-; SSSE3-NEXT: movaps %xmm0, (%rax)
+; SSSE3-NEXT: movd {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
+; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm2[0],zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
+; SSSE3-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT: movaps %xmm2, (%rax)
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
+; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: addps %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: SpinningCube:
; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,mem[0]
-; SSE41-NEXT: movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
-; SSE41-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; SSE41-NEXT: movaps %xmm1, %xmm3
-; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0]
-; SSE41-NEXT: movaps %xmm0, %xmm4
-; SSE41-NEXT: insertps {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[2,3]
-; SSE41-NEXT: addps %xmm3, %xmm4
-; SSE41-NEXT: movaps %xmm4, (%rax)
-; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2]
-; SSE41-NEXT: mulps %xmm1, %xmm2
-; SSE41-NEXT: addps %xmm0, %xmm2
-; SSE41-NEXT: movaps %xmm2, (%rax)
+; SSE41-NEXT: movaps %xmm0, %xmm1
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; SSE41-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
+; SSE41-NEXT: movaps %xmm1, (%rax)
+; SSE41-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2]
+; SSE41-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE41-NEXT: addps %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm1, (%rax)
; SSE41-NEXT: retq
;
; AVX-LABEL: SpinningCube:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0.0E+0,0.0E+0,0.0E+0,1.0E+0]
-; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u]
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vmovaps %xmm2, (%rax)
-; AVX-NEXT: vbroadcastss (%rax), %xmm2
-; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vmovaps %xmm1, (%rax)
+; AVX-NEXT: vbroadcastss (%rax), %xmm1
+; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovaps %xmm0, (%rax)
; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/issue163738.ll b/llvm/test/CodeGen/X86/vpternlog.ll
index 61fe043..bd7478d 100644
--- a/llvm/test/CodeGen/X86/issue163738.ll
+++ b/llvm/test/CodeGen/X86/vpternlog.ll
@@ -11,3 +11,15 @@ define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
%and3 = xor <8 x i64> %and3.demorgan, splat (i64 -1)
ret <8 x i64> %and3
}
+
+define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) {
+; CHECK-LABEL: xorbitcast:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1)
+; CHECK-NEXT: retq
+ %or1 = or <64 x i8> %a, %b
+ %or2 = or <64 x i8> %or1, %c
+ %cast = bitcast <64 x i8> %or2 to <8 x i64>
+ %xor = xor <8 x i64> %cast, splat (i64 -1)
+ ret <8 x i64> %xor
+}
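
The added xorbitcast test folds ~(a | b | c) into a single ternary-logic instruction. The immediate is elided by the check regex, but it can be worked out from the conventional truth-table constants A=0xF0, B=0xCC, C=0xAA: ~(0xF0 | 0xCC | 0xAA) = ~0xFE = 0x01, i.e. only the row where all three inputs are 0 produces a 1. A minimal C sketch under that assumption (the helper name nor3_epi64 is made up; the intrinsic is the standard AVX-512F one):

#include <immintrin.h>

/* Three-input NOR in one vpternlogq: only the a=b=c=0 row of the truth
   table is set, so the immediate is 0x01. */
static inline __m512i nor3_epi64(__m512i a, __m512i b, __m512i c) {
  return _mm512_ternarylogic_epi64(a, b, c, 0x01);
}
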
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 3c98eba6..65b6028 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -1,36 +1,36 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK0
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK1
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK2
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK3
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK4
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK5
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK6
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK7
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK8
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK9
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK10
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK11
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK12
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK13
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK14
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK15
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK16
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK17
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK18
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK19
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK20
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK21
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK22
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK23
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK24
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK25
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK26
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK27
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK28
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK29
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK30
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK31
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-SSE4
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-SHLD-NO-BMI2-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-SSE2
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-SSE4
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-SSE4
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-SSE4
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-SSE4
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-SHLD-NO-BMI2-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX1
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX512

define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: lshr_4bytes:
@@ -646,784 +646,596 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
-; FALLBACK16-LABEL: lshr_16bytes:
-; FALLBACK16: # %bb.0:
-; FALLBACK16-NEXT: pushl %ebp
-; FALLBACK16-NEXT: pushl %ebx
-; FALLBACK16-NEXT: pushl %edi
-; FALLBACK16-NEXT: pushl %esi
-; FALLBACK16-NEXT: subl $60, %esp
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT: movl (%ecx), %edx
-; FALLBACK16-NEXT: movl 4(%ecx), %esi
-; FALLBACK16-NEXT: movl 8(%ecx), %edi
-; FALLBACK16-NEXT: movl 12(%ecx), %ecx
-; FALLBACK16-NEXT: movb (%eax), %ah
-; FALLBACK16-NEXT: movb %ah, %al
-; FALLBACK16-NEXT: shlb $3, %al
-; FALLBACK16-NEXT: xorps %xmm0, %xmm0
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: andb $12, %ah
-; FALLBACK16-NEXT: movzbl %ah, %ebp
-; FALLBACK16-NEXT: movl 20(%esp,%ebp), %esi
-; FALLBACK16-NEXT: movl %esi, %ebx
-; FALLBACK16-NEXT: movl %eax, %ecx
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: movl %eax, %edx
-; FALLBACK16-NEXT: notb %dl
-; FALLBACK16-NEXT: movl 24(%esp,%ebp), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: leal (%ecx,%ecx), %edi
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: orl %ebx, %edi
-; FALLBACK16-NEXT: movl 16(%esp,%ebp), %ebx
-; FALLBACK16-NEXT: movl %eax, %ecx
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: addl %esi, %esi
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: orl %ebx, %esi
-; FALLBACK16-NEXT: movl %eax, %ecx
-; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; FALLBACK16-NEXT: movl 28(%esp,%ebp), %ebx
-; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK16-NEXT: movl %eax, %ecx
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: movl %ebx, 12(%edx)
-; FALLBACK16-NEXT: movl %ebp, 8(%edx)
-; FALLBACK16-NEXT: movl %esi, (%edx)
-; FALLBACK16-NEXT: movl %edi, 4(%edx)
-; FALLBACK16-NEXT: addl $60, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: lshr_16bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebp
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $44, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK17-NEXT: movl (%edx), %esi
-; FALLBACK17-NEXT: movl 4(%edx), %edi
-; FALLBACK17-NEXT: movl 8(%edx), %ebx
-; FALLBACK17-NEXT: movl 12(%edx), %edx
-; FALLBACK17-NEXT: movb (%ecx), %ch
-; FALLBACK17-NEXT: movb %ch, %cl
-; FALLBACK17-NEXT: shlb $3, %cl
-; FALLBACK17-NEXT: xorps %xmm0, %xmm0
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, (%esp)
-; FALLBACK17-NEXT: andb $12, %ch
-; FALLBACK17-NEXT: movzbl %ch, %ebx
-; FALLBACK17-NEXT: movl 8(%esp,%ebx), %esi
-; FALLBACK17-NEXT: movl (%esp,%ebx), %edx
-; FALLBACK17-NEXT: movl 4(%esp,%ebx), %ebp
-; FALLBACK17-NEXT: movl %ebp, %edi
-; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK17-NEXT: movl 12(%esp,%ebx), %ebx
-; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
-; FALLBACK17-NEXT: shrdl %cl, %ebp, %edx
-; FALLBACK17-NEXT: shrl %cl, %ebx
-; FALLBACK17-NEXT: movl %esi, 8(%eax)
-; FALLBACK17-NEXT: movl %ebx, 12(%eax)
-; FALLBACK17-NEXT: movl %edx, (%eax)
-; FALLBACK17-NEXT: movl %edi, 4(%eax)
-; FALLBACK17-NEXT: addl $44, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: popl %ebp
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: lshr_16bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $44, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK18-NEXT: movl (%ecx), %edx
-; FALLBACK18-NEXT: movl 4(%ecx), %esi
-; FALLBACK18-NEXT: movl 8(%ecx), %edi
-; FALLBACK18-NEXT: movl 12(%ecx), %ecx
-; FALLBACK18-NEXT: movzbl (%eax), %ebx
-; FALLBACK18-NEXT: movl %ebx, %eax
-; FALLBACK18-NEXT: shlb $3, %al
-; FALLBACK18-NEXT: xorps %xmm0, %xmm0
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edx, (%esp)
-; FALLBACK18-NEXT: andb $12, %bl
-; FALLBACK18-NEXT: movzbl %bl, %esi
-; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi
-; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx
-; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp
-; FALLBACK18-NEXT: movl %eax, %edx
-; FALLBACK18-NEXT: notb %dl
-; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK18-NEXT: orl %ebp, %ecx
-; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %edx, %edi, %edi
-; FALLBACK18-NEXT: orl %ebp, %edi
-; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx
-; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi
-; FALLBACK18-NEXT: shrxl %eax, %esi, %eax
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %edx
-; FALLBACK18-NEXT: orl %ebx, %edx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK18-NEXT: movl %eax, 12(%esi)
-; FALLBACK18-NEXT: movl %edx, 8(%esi)
-; FALLBACK18-NEXT: movl %edi, (%esi)
-; FALLBACK18-NEXT: movl %ecx, 4(%esi)
-; FALLBACK18-NEXT: addl $44, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: lshr_16bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $44, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK19-NEXT: movl (%edx), %esi
-; FALLBACK19-NEXT: movl 4(%edx), %edi
-; FALLBACK19-NEXT: movl 8(%edx), %ebx
-; FALLBACK19-NEXT: movl 12(%edx), %edx
-; FALLBACK19-NEXT: movzbl (%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, %ecx
-; FALLBACK19-NEXT: shlb $3, %cl
-; FALLBACK19-NEXT: xorps %xmm0, %xmm0
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, (%esp)
-; FALLBACK19-NEXT: andb $12, %al
-; FALLBACK19-NEXT: movzbl %al, %eax
-; FALLBACK19-NEXT: movl 8(%esp,%eax), %ebx
-; FALLBACK19-NEXT: movl (%esp,%eax), %edx
-; FALLBACK19-NEXT: movl 4(%esp,%eax), %esi
-; FALLBACK19-NEXT: movl %esi, %edi
-; FALLBACK19-NEXT: shrdl %cl, %ebx, %edi
-; FALLBACK19-NEXT: movl 12(%esp,%eax), %eax
-; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK19-NEXT: movl %ebx, 8(%ebp)
-; FALLBACK19-NEXT: shrxl %ecx, %eax, %eax
-; FALLBACK19-NEXT: movl %eax, 12(%ebp)
-; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK19-NEXT: movl %edx, (%ebp)
-; FALLBACK19-NEXT: movl %edi, 4(%ebp)
-; FALLBACK19-NEXT: addl $44, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: lshr_16bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $60, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movzbl (%eax), %ecx
-; FALLBACK20-NEXT: movl %ecx, %eax
-; FALLBACK20-NEXT: shlb $3, %al
-; FALLBACK20-NEXT: xorps %xmm1, %xmm1
-; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: andb $12, %cl
-; FALLBACK20-NEXT: movzbl %cl, %edi
-; FALLBACK20-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK20-NEXT: movl 20(%esp,%edi), %esi
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: movl %eax, %edx
-; FALLBACK20-NEXT: notb %dl
-; FALLBACK20-NEXT: addl %esi, %esi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: orl %ebx, %esi
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 24(%esp,%edi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %esi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %esi
-; FALLBACK20-NEXT: movl 28(%esp,%edi), %edi
-; FALLBACK20-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: orl %esi, %ebp
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %esi
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %esi, %ebx
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: movl %edi, 12(%edx)
-; FALLBACK20-NEXT: movl %ebx, 4(%edx)
-; FALLBACK20-NEXT: movl %ebp, 8(%edx)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movl %eax, (%edx)
-; FALLBACK20-NEXT: addl $60, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: lshr_16bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $44, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK21-NEXT: movups (%edx), %xmm0
-; FALLBACK21-NEXT: movzbl (%ecx), %edx
-; FALLBACK21-NEXT: movl %edx, %ecx
-; FALLBACK21-NEXT: shlb $3, %cl
-; FALLBACK21-NEXT: xorps %xmm1, %xmm1
-; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm0, (%esp)
-; FALLBACK21-NEXT: andb $12, %dl
-; FALLBACK21-NEXT: movzbl %dl, %ebx
-; FALLBACK21-NEXT: movl 12(%esp,%ebx), %edx
-; FALLBACK21-NEXT: movl 8(%esp,%ebx), %ebp
-; FALLBACK21-NEXT: movl %ebp, %edi
-; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK21-NEXT: movl (%esp,%ebx), %esi
-; FALLBACK21-NEXT: movl 4(%esp,%ebx), %eax
-; FALLBACK21-NEXT: movl %eax, %ebx
-; FALLBACK21-NEXT: shrdl %cl, %ebp, %ebx
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK21-NEXT: movl %ebx, 4(%ebp)
-; FALLBACK21-NEXT: movl %edi, 8(%ebp)
-; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT: shrl %cl, %edx
-; FALLBACK21-NEXT: movl %edx, 12(%ebp)
-; FALLBACK21-NEXT: movl %esi, (%ebp)
-; FALLBACK21-NEXT: addl $44, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: lshr_16bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $44, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movzbl (%eax), %ecx
-; FALLBACK22-NEXT: movl %ecx, %eax
-; FALLBACK22-NEXT: shlb $3, %al
-; FALLBACK22-NEXT: xorps %xmm1, %xmm1
-; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, (%esp)
-; FALLBACK22-NEXT: andb $12, %cl
-; FALLBACK22-NEXT: movzbl %cl, %edi
-; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx
-; FALLBACK22-NEXT: movl %eax, %ecx
-; FALLBACK22-NEXT: notb %cl
-; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp
-; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi
-; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx
-; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx
-; FALLBACK22-NEXT: orl %ebx, %edx
-; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx
-; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp
-; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi
-; FALLBACK22-NEXT: shrxl %eax, %edi, %eax
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi
-; FALLBACK22-NEXT: orl %ebx, %edi
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx
-; FALLBACK22-NEXT: orl %ebp, %ecx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK22-NEXT: movl %eax, 12(%esi)
-; FALLBACK22-NEXT: movl %ecx, 4(%esi)
-; FALLBACK22-NEXT: movl %edi, 8(%esi)
-; FALLBACK22-NEXT: movl %edx, (%esi)
-; FALLBACK22-NEXT: addl $44, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: lshr_16bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $44, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK23-NEXT: movups (%edx), %xmm0
-; FALLBACK23-NEXT: movzbl (%ecx), %edx
-; FALLBACK23-NEXT: movl %edx, %ecx
-; FALLBACK23-NEXT: shlb $3, %cl
-; FALLBACK23-NEXT: xorps %xmm1, %xmm1
-; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm0, (%esp)
-; FALLBACK23-NEXT: andb $12, %dl
-; FALLBACK23-NEXT: movzbl %dl, %ebx
-; FALLBACK23-NEXT: movl 12(%esp,%ebx), %edx
-; FALLBACK23-NEXT: movl 8(%esp,%ebx), %ebp
-; FALLBACK23-NEXT: movl %ebp, %edi
-; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK23-NEXT: movl (%esp,%ebx), %esi
-; FALLBACK23-NEXT: movl 4(%esp,%ebx), %eax
-; FALLBACK23-NEXT: movl %eax, %ebx
-; FALLBACK23-NEXT: shrdl %cl, %ebp, %ebx
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK23-NEXT: movl %ebx, 4(%ebp)
-; FALLBACK23-NEXT: movl %edi, 8(%ebp)
-; FALLBACK23-NEXT: shrxl %ecx, %edx, %edx
-; FALLBACK23-NEXT: movl %edx, 12(%ebp)
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK23-NEXT: movl %esi, (%ebp)
-; FALLBACK23-NEXT: addl $44, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: lshr_16bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $60, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK24-NEXT: movzbl (%eax), %ecx
-; FALLBACK24-NEXT: movl %ecx, %eax
-; FALLBACK24-NEXT: shlb $3, %al
-; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: andb $12, %cl
-; FALLBACK24-NEXT: movzbl %cl, %edi
-; FALLBACK24-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK24-NEXT: movl 20(%esp,%edi), %esi
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: movl %eax, %edx
-; FALLBACK24-NEXT: notb %dl
-; FALLBACK24-NEXT: addl %esi, %esi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: orl %ebx, %esi
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 24(%esp,%edi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %esi
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %esi
-; FALLBACK24-NEXT: movl 28(%esp,%edi), %edi
-; FALLBACK24-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: orl %esi, %ebp
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %esi
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %esi, %ebx
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: movl %edi, 12(%edx)
-; FALLBACK24-NEXT: movl %ebx, 4(%edx)
-; FALLBACK24-NEXT: movl %ebp, 8(%edx)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movl %eax, (%edx)
-; FALLBACK24-NEXT: addl $60, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: retl
-;
-; FALLBACK25-LABEL: lshr_16bytes:
-; FALLBACK25: # %bb.0:
-; FALLBACK25-NEXT: pushl %ebp
-; FALLBACK25-NEXT: pushl %ebx
-; FALLBACK25-NEXT: pushl %edi
-; FALLBACK25-NEXT: pushl %esi
-; FALLBACK25-NEXT: subl $44, %esp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK25-NEXT: vmovups (%edx), %xmm0
-; FALLBACK25-NEXT: movzbl (%ecx), %edx
-; FALLBACK25-NEXT: movl %edx, %ecx
-; FALLBACK25-NEXT: shlb $3, %cl
-; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovaps %xmm0, (%esp)
-; FALLBACK25-NEXT: andb $12, %dl
-; FALLBACK25-NEXT: movzbl %dl, %ebx
-; FALLBACK25-NEXT: movl 12(%esp,%ebx), %edx
-; FALLBACK25-NEXT: movl 8(%esp,%ebx), %ebp
-; FALLBACK25-NEXT: movl %ebp, %edi
-; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK25-NEXT: movl (%esp,%ebx), %esi
-; FALLBACK25-NEXT: movl 4(%esp,%ebx), %eax
-; FALLBACK25-NEXT: movl %eax, %ebx
-; FALLBACK25-NEXT: shrdl %cl, %ebp, %ebx
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK25-NEXT: movl %ebx, 4(%ebp)
-; FALLBACK25-NEXT: movl %edi, 8(%ebp)
-; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT: shrl %cl, %edx
-; FALLBACK25-NEXT: movl %edx, 12(%ebp)
-; FALLBACK25-NEXT: movl %esi, (%ebp)
-; FALLBACK25-NEXT: addl $44, %esp
-; FALLBACK25-NEXT: popl %esi
-; FALLBACK25-NEXT: popl %edi
-; FALLBACK25-NEXT: popl %ebx
-; FALLBACK25-NEXT: popl %ebp
-; FALLBACK25-NEXT: retl
-;
-; FALLBACK26-LABEL: lshr_16bytes:
-; FALLBACK26: # %bb.0:
-; FALLBACK26-NEXT: pushl %ebp
-; FALLBACK26-NEXT: pushl %ebx
-; FALLBACK26-NEXT: pushl %edi
-; FALLBACK26-NEXT: pushl %esi
-; FALLBACK26-NEXT: subl $44, %esp
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK26-NEXT: movzbl (%eax), %ecx
-; FALLBACK26-NEXT: movl %ecx, %eax
-; FALLBACK26-NEXT: shlb $3, %al
-; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovaps %xmm0, (%esp)
-; FALLBACK26-NEXT: andb $12, %cl
-; FALLBACK26-NEXT: movzbl %cl, %edi
-; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx
-; FALLBACK26-NEXT: movl %eax, %ecx
-; FALLBACK26-NEXT: notb %cl
-; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp
-; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi
-; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx
-; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx
-; FALLBACK26-NEXT: orl %ebx, %edx
-; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx
-; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp
-; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi
-; FALLBACK26-NEXT: shrxl %eax, %edi, %eax
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi
-; FALLBACK26-NEXT: orl %ebx, %edi
-; FALLBACK26-NEXT: addl %esi, %esi
-; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx
-; FALLBACK26-NEXT: orl %ebp, %ecx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK26-NEXT: movl %eax, 12(%esi)
-; FALLBACK26-NEXT: movl %ecx, 4(%esi)
-; FALLBACK26-NEXT: movl %edi, 8(%esi)
-; FALLBACK26-NEXT: movl %edx, (%esi)
-; FALLBACK26-NEXT: addl $44, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: lshr_16bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $44, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK27-NEXT: vmovups (%edx), %xmm0
-; FALLBACK27-NEXT: movzbl (%ecx), %edx
-; FALLBACK27-NEXT: movl %edx, %ecx
-; FALLBACK27-NEXT: shlb $3, %cl
-; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovaps %xmm0, (%esp)
-; FALLBACK27-NEXT: andb $12, %dl
-; FALLBACK27-NEXT: movzbl %dl, %ebx
-; FALLBACK27-NEXT: movl 12(%esp,%ebx), %edx
-; FALLBACK27-NEXT: movl 8(%esp,%ebx), %ebp
-; FALLBACK27-NEXT: movl %ebp, %edi
-; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK27-NEXT: movl (%esp,%ebx), %esi
-; FALLBACK27-NEXT: movl 4(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, %ebx
-; FALLBACK27-NEXT: shrdl %cl, %ebp, %ebx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK27-NEXT: movl %ebx, 4(%ebp)
-; FALLBACK27-NEXT: movl %edi, 8(%ebp)
-; FALLBACK27-NEXT: shrxl %ecx, %edx, %edx
-; FALLBACK27-NEXT: movl %edx, 12(%ebp)
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK27-NEXT: movl %esi, (%ebp)
-; FALLBACK27-NEXT: addl $44, %esp
-; FALLBACK27-NEXT: popl %esi
-; FALLBACK27-NEXT: popl %edi
-; FALLBACK27-NEXT: popl %ebx
-; FALLBACK27-NEXT: popl %ebp
-; FALLBACK27-NEXT: retl
-;
-; FALLBACK28-LABEL: lshr_16bytes:
-; FALLBACK28: # %bb.0:
-; FALLBACK28-NEXT: pushl %ebp
-; FALLBACK28-NEXT: pushl %ebx
-; FALLBACK28-NEXT: pushl %edi
-; FALLBACK28-NEXT: pushl %esi
-; FALLBACK28-NEXT: subl $60, %esp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK28-NEXT: movzbl (%eax), %ecx
-; FALLBACK28-NEXT: movl %ecx, %eax
-; FALLBACK28-NEXT: shlb $3, %al
-; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: andb $12, %cl
-; FALLBACK28-NEXT: movzbl %cl, %edi
-; FALLBACK28-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK28-NEXT: movl 20(%esp,%edi), %esi
-; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: movl %eax, %edx
-; FALLBACK28-NEXT: notb %dl
-; FALLBACK28-NEXT: addl %esi, %esi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: orl %ebx, %esi
-; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 24(%esp,%edi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %esi
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %esi
-; FALLBACK28-NEXT: movl 28(%esp,%edi), %edi
-; FALLBACK28-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: orl %esi, %ebp
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %esi
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %esi, %ebx
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: movl %edi, 12(%edx)
-; FALLBACK28-NEXT: movl %ebx, 4(%edx)
-; FALLBACK28-NEXT: movl %ebp, 8(%edx)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: movl %eax, (%edx)
-; FALLBACK28-NEXT: addl $60, %esp
-; FALLBACK28-NEXT: popl %esi
-; FALLBACK28-NEXT: popl %edi
-; FALLBACK28-NEXT: popl %ebx
-; FALLBACK28-NEXT: popl %ebp
-; FALLBACK28-NEXT: retl
-;
-; FALLBACK29-LABEL: lshr_16bytes:
-; FALLBACK29: # %bb.0:
-; FALLBACK29-NEXT: pushl %ebp
-; FALLBACK29-NEXT: pushl %ebx
-; FALLBACK29-NEXT: pushl %edi
-; FALLBACK29-NEXT: pushl %esi
-; FALLBACK29-NEXT: subl $44, %esp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK29-NEXT: vmovups (%edx), %xmm0
-; FALLBACK29-NEXT: movzbl (%ecx), %edx
-; FALLBACK29-NEXT: movl %edx, %ecx
-; FALLBACK29-NEXT: shlb $3, %cl
-; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: vmovaps %xmm0, (%esp)
-; FALLBACK29-NEXT: andb $12, %dl
-; FALLBACK29-NEXT: movzbl %dl, %ebx
-; FALLBACK29-NEXT: movl 12(%esp,%ebx), %edx
-; FALLBACK29-NEXT: movl 8(%esp,%ebx), %ebp
-; FALLBACK29-NEXT: movl %ebp, %edi
-; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK29-NEXT: movl (%esp,%ebx), %esi
-; FALLBACK29-NEXT: movl 4(%esp,%ebx), %eax
-; FALLBACK29-NEXT: movl %eax, %ebx
-; FALLBACK29-NEXT: shrdl %cl, %ebp, %ebx
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK29-NEXT: movl %ebx, 4(%ebp)
-; FALLBACK29-NEXT: movl %edi, 8(%ebp)
-; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT: shrl %cl, %edx
-; FALLBACK29-NEXT: movl %edx, 12(%ebp)
-; FALLBACK29-NEXT: movl %esi, (%ebp)
-; FALLBACK29-NEXT: addl $44, %esp
-; FALLBACK29-NEXT: popl %esi
-; FALLBACK29-NEXT: popl %edi
-; FALLBACK29-NEXT: popl %ebx
-; FALLBACK29-NEXT: popl %ebp
-; FALLBACK29-NEXT: retl
-;
-; FALLBACK30-LABEL: lshr_16bytes:
-; FALLBACK30: # %bb.0:
-; FALLBACK30-NEXT: pushl %ebp
-; FALLBACK30-NEXT: pushl %ebx
-; FALLBACK30-NEXT: pushl %edi
-; FALLBACK30-NEXT: pushl %esi
-; FALLBACK30-NEXT: subl $44, %esp
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK30-NEXT: movzbl (%eax), %ecx
-; FALLBACK30-NEXT: movl %ecx, %eax
-; FALLBACK30-NEXT: shlb $3, %al
-; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: vmovaps %xmm0, (%esp)
-; FALLBACK30-NEXT: andb $12, %cl
-; FALLBACK30-NEXT: movzbl %cl, %edi
-; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx
-; FALLBACK30-NEXT: movl %eax, %ecx
-; FALLBACK30-NEXT: notb %cl
-; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp
-; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi
-; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx
-; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx
-; FALLBACK30-NEXT: orl %ebx, %edx
-; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx
-; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp
-; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi
-; FALLBACK30-NEXT: shrxl %eax, %edi, %eax
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi
-; FALLBACK30-NEXT: orl %ebx, %edi
-; FALLBACK30-NEXT: addl %esi, %esi
-; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx
-; FALLBACK30-NEXT: orl %ebp, %ecx
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK30-NEXT: movl %eax, 12(%esi)
-; FALLBACK30-NEXT: movl %ecx, 4(%esi)
-; FALLBACK30-NEXT: movl %edi, 8(%esi)
-; FALLBACK30-NEXT: movl %edx, (%esi)
-; FALLBACK30-NEXT: addl $44, %esp
-; FALLBACK30-NEXT: popl %esi
-; FALLBACK30-NEXT: popl %edi
-; FALLBACK30-NEXT: popl %ebx
-; FALLBACK30-NEXT: popl %ebp
-; FALLBACK30-NEXT: retl
-;
-; FALLBACK31-LABEL: lshr_16bytes:
-; FALLBACK31: # %bb.0:
-; FALLBACK31-NEXT: pushl %ebp
-; FALLBACK31-NEXT: pushl %ebx
-; FALLBACK31-NEXT: pushl %edi
-; FALLBACK31-NEXT: pushl %esi
-; FALLBACK31-NEXT: subl $44, %esp
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK31-NEXT: vmovups (%edx), %xmm0
-; FALLBACK31-NEXT: movzbl (%ecx), %edx
-; FALLBACK31-NEXT: movl %edx, %ecx
-; FALLBACK31-NEXT: shlb $3, %cl
-; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: vmovaps %xmm0, (%esp)
-; FALLBACK31-NEXT: andb $12, %dl
-; FALLBACK31-NEXT: movzbl %dl, %ebx
-; FALLBACK31-NEXT: movl 12(%esp,%ebx), %edx
-; FALLBACK31-NEXT: movl 8(%esp,%ebx), %ebp
-; FALLBACK31-NEXT: movl %ebp, %edi
-; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK31-NEXT: movl (%esp,%ebx), %esi
-; FALLBACK31-NEXT: movl 4(%esp,%ebx), %eax
-; FALLBACK31-NEXT: movl %eax, %ebx
-; FALLBACK31-NEXT: shrdl %cl, %ebp, %ebx
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK31-NEXT: movl %ebx, 4(%ebp)
-; FALLBACK31-NEXT: movl %edi, 8(%ebp)
-; FALLBACK31-NEXT: shrxl %ecx, %edx, %edx
-; FALLBACK31-NEXT: movl %edx, 12(%ebp)
-; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK31-NEXT: movl %esi, (%ebp)
-; FALLBACK31-NEXT: addl $44, %esp
-; FALLBACK31-NEXT: popl %esi
-; FALLBACK31-NEXT: popl %edi
-; FALLBACK31-NEXT: popl %ebx
-; FALLBACK31-NEXT: popl %ebp
-; FALLBACK31-NEXT: retl
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_16bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %al
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ah, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_16bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%edx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ch, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%esp,%ebx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp,%ebx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebp, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_16bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %bl, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%esp,%edi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%esp,%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, (%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%esp,%edi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %al, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%esp,%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%esp,%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%esp,%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %eax, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_16bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_16bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%edx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %dl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 12(%esp,%ebx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 8(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp,%ebx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 4(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_16bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $60, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%ebp,%ebp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %ebx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 8(%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $60, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%edx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 12(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 8(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%esp,%ebx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 4(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_16bytes:
+; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $12, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 16(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 20(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 24(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 28(%esp,%edi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_16bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%edx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %dl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 12(%esp,%ebx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 8(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%esp,%ebx), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 4(%esp,%ebx), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_16bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $60, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, 16(%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%esp,%edi), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%esp,%edi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%ebp,%ebp), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, %ebx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %eax, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 12(%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 8(%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $60, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%edx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 12(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 8(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%esp,%ebx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 4(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
%bitOff = shl i128 %byteOff, 3
@@ -1661,791 +1473,599 @@ define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx)
; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq
;
-; FALLBACK16-LABEL: shl_16bytes:
-; FALLBACK16: # %bb.0:
-; FALLBACK16-NEXT: pushl %ebp
-; FALLBACK16-NEXT: pushl %ebx
-; FALLBACK16-NEXT: pushl %edi
-; FALLBACK16-NEXT: pushl %esi
-; FALLBACK16-NEXT: subl $60, %esp
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT: movl (%ecx), %ebx
-; FALLBACK16-NEXT: movl 4(%ecx), %esi
-; FALLBACK16-NEXT: movl 8(%ecx), %edi
-; FALLBACK16-NEXT: movl 12(%ecx), %ecx
-; FALLBACK16-NEXT: movb (%eax), %ah
-; FALLBACK16-NEXT: movb %ah, %dh
-; FALLBACK16-NEXT: shlb $3, %dh
-; FALLBACK16-NEXT: xorps %xmm0, %xmm0
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: andb $12, %ah
-; FALLBACK16-NEXT: negb %ah
-; FALLBACK16-NEXT: movsbl %ah, %ebp
-; FALLBACK16-NEXT: movl 32(%esp,%ebp), %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 36(%esp,%ebp), %esi
-; FALLBACK16-NEXT: movl %esi, %edi
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: movb %dh, %dl
-; FALLBACK16-NEXT: notb %dl
-; FALLBACK16-NEXT: shrl %ebx
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: orl %edi, %ebx
-; FALLBACK16-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: movl 40(%esp,%ebp), %edi
-; FALLBACK16-NEXT: movl %edi, %ebp
-; FALLBACK16-NEXT: shrl %ebp
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: shrl %esi
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: orl %edi, %esi
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: movl %edx, (%eax)
-; FALLBACK16-NEXT: movl %esi, 8(%eax)
-; FALLBACK16-NEXT: movl %ebp, 12(%eax)
-; FALLBACK16-NEXT: movl %ebx, 4(%eax)
-; FALLBACK16-NEXT: addl $60, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: shl_16bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $32, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK17-NEXT: movl (%edx), %esi
-; FALLBACK17-NEXT: movl 4(%edx), %edi
-; FALLBACK17-NEXT: movl 8(%edx), %ebx
-; FALLBACK17-NEXT: movl 12(%edx), %edx
-; FALLBACK17-NEXT: movb (%ecx), %ch
-; FALLBACK17-NEXT: movb %ch, %cl
-; FALLBACK17-NEXT: shlb $3, %cl
-; FALLBACK17-NEXT: xorps %xmm0, %xmm0
-; FALLBACK17-NEXT: movaps %xmm0, (%esp)
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: andb $12, %ch
-; FALLBACK17-NEXT: negb %ch
-; FALLBACK17-NEXT: movsbl %ch, %edi
-; FALLBACK17-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK17-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK17-NEXT: shldl %cl, %esi, %edx
-; FALLBACK17-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK17-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK17-NEXT: shldl %cl, %edi, %esi
-; FALLBACK17-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK17-NEXT: shll %cl, %ebx
-; FALLBACK17-NEXT: movl %esi, 8(%eax)
-; FALLBACK17-NEXT: movl %edx, 12(%eax)
-; FALLBACK17-NEXT: movl %ebx, (%eax)
-; FALLBACK17-NEXT: movl %edi, 4(%eax)
-; FALLBACK17-NEXT: addl $32, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: shl_16bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $44, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK18-NEXT: movl (%ecx), %edx
-; FALLBACK18-NEXT: movl 4(%ecx), %esi
-; FALLBACK18-NEXT: movl 8(%ecx), %edi
-; FALLBACK18-NEXT: movl 12(%ecx), %ecx
-; FALLBACK18-NEXT: movzbl (%eax), %eax
-; FALLBACK18-NEXT: movl %eax, %ebx
-; FALLBACK18-NEXT: shlb $3, %bl
-; FALLBACK18-NEXT: xorps %xmm0, %xmm0
-; FALLBACK18-NEXT: movaps %xmm0, (%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: andb $12, %al
-; FALLBACK18-NEXT: negb %al
-; FALLBACK18-NEXT: movsbl %al, %edx
-; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi
-; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx
-; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp
-; FALLBACK18-NEXT: movl %ebx, %eax
-; FALLBACK18-NEXT: notb %al
-; FALLBACK18-NEXT: shrl %edi
-; FALLBACK18-NEXT: shrxl %eax, %edi, %edi
-; FALLBACK18-NEXT: orl %esi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi
-; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx
-; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx
-; FALLBACK18-NEXT: shrl %edx
-; FALLBACK18-NEXT: shrxl %eax, %edx, %edx
-; FALLBACK18-NEXT: orl %esi, %edx
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax
-; FALLBACK18-NEXT: orl %ebx, %eax
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK18-NEXT: movl %ebp, (%ecx)
-; FALLBACK18-NEXT: movl %eax, 8(%ecx)
-; FALLBACK18-NEXT: movl %edx, 12(%ecx)
-; FALLBACK18-NEXT: movl %edi, 4(%ecx)
-; FALLBACK18-NEXT: addl $44, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: shl_16bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $44, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK19-NEXT: movl (%edx), %esi
-; FALLBACK19-NEXT: movl 4(%edx), %edi
-; FALLBACK19-NEXT: movl 8(%edx), %ebx
-; FALLBACK19-NEXT: movl 12(%edx), %edx
-; FALLBACK19-NEXT: movzbl (%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, %ecx
-; FALLBACK19-NEXT: shlb $3, %cl
-; FALLBACK19-NEXT: xorps %xmm0, %xmm0
-; FALLBACK19-NEXT: movaps %xmm0, (%esp)
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: andb $12, %al
-; FALLBACK19-NEXT: negb %al
-; FALLBACK19-NEXT: movsbl %al, %eax
-; FALLBACK19-NEXT: movl 24(%esp,%eax), %esi
-; FALLBACK19-NEXT: movl 28(%esp,%eax), %edx
-; FALLBACK19-NEXT: shldl %cl, %esi, %edx
-; FALLBACK19-NEXT: movl 16(%esp,%eax), %edi
-; FALLBACK19-NEXT: movl 20(%esp,%eax), %eax
-; FALLBACK19-NEXT: shldl %cl, %eax, %esi
-; FALLBACK19-NEXT: shldl %cl, %edi, %eax
-; FALLBACK19-NEXT: shlxl %ecx, %edi, %ecx
-; FALLBACK19-NEXT: movl %esi, 8(%ebp)
-; FALLBACK19-NEXT: movl %edx, 12(%ebp)
-; FALLBACK19-NEXT: movl %ecx, (%ebp)
-; FALLBACK19-NEXT: movl %eax, 4(%ebp)
-; FALLBACK19-NEXT: addl $44, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: shl_16bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $60, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movzbl (%eax), %ecx
-; FALLBACK20-NEXT: movl %ecx, %eax
-; FALLBACK20-NEXT: shlb $3, %al
-; FALLBACK20-NEXT: xorps %xmm1, %xmm1
-; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: andb $12, %cl
-; FALLBACK20-NEXT: negb %cl
-; FALLBACK20-NEXT: movsbl %cl, %edi
-; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl %eax, %edx
-; FALLBACK20-NEXT: notb %dl
-; FALLBACK20-NEXT: movl 40(%esp,%edi), %ebp
-; FALLBACK20-NEXT: movl %ebp, %esi
-; FALLBACK20-NEXT: shrl %esi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %esi
-; FALLBACK20-NEXT: orl %ebx, %esi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: movl 32(%esp,%edi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %edi
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %ebp, %edi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK20-NEXT: shrl %ebp
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: orl %ebx, %ebp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: movl %eax, (%edx)
-; FALLBACK20-NEXT: movl %ebp, 4(%edx)
-; FALLBACK20-NEXT: movl %edi, 8(%edx)
-; FALLBACK20-NEXT: movl %esi, 12(%edx)
-; FALLBACK20-NEXT: addl $60, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: shl_16bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $44, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK21-NEXT: movups (%edx), %xmm0
-; FALLBACK21-NEXT: movzbl (%ecx), %edx
-; FALLBACK21-NEXT: movl %edx, %ecx
-; FALLBACK21-NEXT: shlb $3, %cl
-; FALLBACK21-NEXT: xorps %xmm1, %xmm1
-; FALLBACK21-NEXT: movaps %xmm1, (%esp)
-; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: andb $12, %dl
-; FALLBACK21-NEXT: negb %dl
-; FALLBACK21-NEXT: movsbl %dl, %edi
-; FALLBACK21-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK21-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK21-NEXT: shldl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK21-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK21-NEXT: shldl %cl, %edi, %esi
-; FALLBACK21-NEXT: movl %ebx, %ebp
-; FALLBACK21-NEXT: shll %cl, %ebp
-; FALLBACK21-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK21-NEXT: movl %edi, 4(%eax)
-; FALLBACK21-NEXT: movl %esi, 8(%eax)
-; FALLBACK21-NEXT: movl %edx, 12(%eax)
-; FALLBACK21-NEXT: movl %ebp, (%eax)
-; FALLBACK21-NEXT: addl $44, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: shl_16bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $44, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movzbl (%eax), %ecx
-; FALLBACK22-NEXT: movl %ecx, %eax
-; FALLBACK22-NEXT: shlb $3, %al
-; FALLBACK22-NEXT: xorps %xmm1, %xmm1
-; FALLBACK22-NEXT: movaps %xmm1, (%esp)
-; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: andb $12, %cl
-; FALLBACK22-NEXT: negb %cl
-; FALLBACK22-NEXT: movsbl %cl, %ecx
-; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
-; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx
-; FALLBACK22-NEXT: shlxl %eax, %edx, %edi
-; FALLBACK22-NEXT: movl %eax, %ebx
-; FALLBACK22-NEXT: notb %bl
-; FALLBACK22-NEXT: shrl %edx
-; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx
-; FALLBACK22-NEXT: orl %esi, %edx
-; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi
-; FALLBACK22-NEXT: movl %esi, %ebp
-; FALLBACK22-NEXT: shrl %ebp
-; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp
-; FALLBACK22-NEXT: orl %edi, %ebp
-; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx
-; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT: orl %esi, %ecx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK22-NEXT: movl %eax, (%esi)
-; FALLBACK22-NEXT: movl %ecx, 4(%esi)
-; FALLBACK22-NEXT: movl %ebp, 8(%esi)
-; FALLBACK22-NEXT: movl %edx, 12(%esi)
-; FALLBACK22-NEXT: addl $44, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: shl_16bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $44, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK23-NEXT: movups (%edx), %xmm0
-; FALLBACK23-NEXT: movzbl (%ecx), %edx
-; FALLBACK23-NEXT: movl %edx, %ecx
-; FALLBACK23-NEXT: shlb $3, %cl
-; FALLBACK23-NEXT: xorps %xmm1, %xmm1
-; FALLBACK23-NEXT: movaps %xmm1, (%esp)
-; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: andb $12, %dl
-; FALLBACK23-NEXT: negb %dl
-; FALLBACK23-NEXT: movsbl %dl, %edi
-; FALLBACK23-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK23-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK23-NEXT: shldl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK23-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK23-NEXT: shldl %cl, %edi, %esi
-; FALLBACK23-NEXT: shlxl %ecx, %ebx, %ebp
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK23-NEXT: movl %edi, 4(%eax)
-; FALLBACK23-NEXT: movl %esi, 8(%eax)
-; FALLBACK23-NEXT: movl %edx, 12(%eax)
-; FALLBACK23-NEXT: movl %ebp, (%eax)
-; FALLBACK23-NEXT: addl $44, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: shl_16bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $60, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK24-NEXT: movzbl (%eax), %ecx
-; FALLBACK24-NEXT: movl %ecx, %eax
-; FALLBACK24-NEXT: shlb $3, %al
-; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: andb $12, %cl
-; FALLBACK24-NEXT: negb %cl
-; FALLBACK24-NEXT: movsbl %cl, %edi
-; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl %eax, %edx
-; FALLBACK24-NEXT: notb %dl
-; FALLBACK24-NEXT: movl 40(%esp,%edi), %ebp
-; FALLBACK24-NEXT: movl %ebp, %esi
-; FALLBACK24-NEXT: shrl %esi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %esi
-; FALLBACK24-NEXT: orl %ebx, %esi
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: movl 32(%esp,%edi), %ecx
-; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %edi
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %ebp, %edi
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK24-NEXT: shrl %ebp
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: orl %ebx, %ebp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: shll %cl, %eax
-; FALLBACK24-NEXT: movl %eax, (%edx)
-; FALLBACK24-NEXT: movl %ebp, 4(%edx)
-; FALLBACK24-NEXT: movl %edi, 8(%edx)
-; FALLBACK24-NEXT: movl %esi, 12(%edx)
-; FALLBACK24-NEXT: addl $60, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: retl
-;
-; FALLBACK25-LABEL: shl_16bytes:
-; FALLBACK25: # %bb.0:
-; FALLBACK25-NEXT: pushl %ebp
-; FALLBACK25-NEXT: pushl %ebx
-; FALLBACK25-NEXT: pushl %edi
-; FALLBACK25-NEXT: pushl %esi
-; FALLBACK25-NEXT: subl $44, %esp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK25-NEXT: vmovups (%edx), %xmm0
-; FALLBACK25-NEXT: movzbl (%ecx), %edx
-; FALLBACK25-NEXT: movl %edx, %ecx
-; FALLBACK25-NEXT: shlb $3, %cl
-; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK25-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: andb $12, %dl
-; FALLBACK25-NEXT: negb %dl
-; FALLBACK25-NEXT: movsbl %dl, %edi
-; FALLBACK25-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK25-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK25-NEXT: shldl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK25-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK25-NEXT: shldl %cl, %edi, %esi
-; FALLBACK25-NEXT: movl %ebx, %ebp
-; FALLBACK25-NEXT: shll %cl, %ebp
-; FALLBACK25-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK25-NEXT: movl %edi, 4(%eax)
-; FALLBACK25-NEXT: movl %esi, 8(%eax)
-; FALLBACK25-NEXT: movl %edx, 12(%eax)
-; FALLBACK25-NEXT: movl %ebp, (%eax)
-; FALLBACK25-NEXT: addl $44, %esp
-; FALLBACK25-NEXT: popl %esi
-; FALLBACK25-NEXT: popl %edi
-; FALLBACK25-NEXT: popl %ebx
-; FALLBACK25-NEXT: popl %ebp
-; FALLBACK25-NEXT: retl
-;
-; FALLBACK26-LABEL: shl_16bytes:
-; FALLBACK26: # %bb.0:
-; FALLBACK26-NEXT: pushl %ebp
-; FALLBACK26-NEXT: pushl %ebx
-; FALLBACK26-NEXT: pushl %edi
-; FALLBACK26-NEXT: pushl %esi
-; FALLBACK26-NEXT: subl $44, %esp
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK26-NEXT: movzbl (%eax), %ecx
-; FALLBACK26-NEXT: movl %ecx, %eax
-; FALLBACK26-NEXT: shlb $3, %al
-; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK26-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: andb $12, %cl
-; FALLBACK26-NEXT: negb %cl
-; FALLBACK26-NEXT: movsbl %cl, %ecx
-; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
-; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx
-; FALLBACK26-NEXT: shlxl %eax, %edx, %edi
-; FALLBACK26-NEXT: movl %eax, %ebx
-; FALLBACK26-NEXT: notb %bl
-; FALLBACK26-NEXT: shrl %edx
-; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx
-; FALLBACK26-NEXT: orl %esi, %edx
-; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi
-; FALLBACK26-NEXT: movl %esi, %ebp
-; FALLBACK26-NEXT: shrl %ebp
-; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp
-; FALLBACK26-NEXT: orl %edi, %ebp
-; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx
-; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax
-; FALLBACK26-NEXT: shrl %ecx
-; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT: orl %esi, %ecx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK26-NEXT: movl %eax, (%esi)
-; FALLBACK26-NEXT: movl %ecx, 4(%esi)
-; FALLBACK26-NEXT: movl %ebp, 8(%esi)
-; FALLBACK26-NEXT: movl %edx, 12(%esi)
-; FALLBACK26-NEXT: addl $44, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: shl_16bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $44, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK27-NEXT: vmovups (%edx), %xmm0
-; FALLBACK27-NEXT: movzbl (%ecx), %edx
-; FALLBACK27-NEXT: movl %edx, %ecx
-; FALLBACK27-NEXT: shlb $3, %cl
-; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK27-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: andb $12, %dl
-; FALLBACK27-NEXT: negb %dl
-; FALLBACK27-NEXT: movsbl %dl, %edi
-; FALLBACK27-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK27-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK27-NEXT: shldl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK27-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK27-NEXT: shldl %cl, %edi, %esi
-; FALLBACK27-NEXT: shlxl %ecx, %ebx, %ebp
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK27-NEXT: movl %edi, 4(%eax)
-; FALLBACK27-NEXT: movl %esi, 8(%eax)
-; FALLBACK27-NEXT: movl %edx, 12(%eax)
-; FALLBACK27-NEXT: movl %ebp, (%eax)
-; FALLBACK27-NEXT: addl $44, %esp
-; FALLBACK27-NEXT: popl %esi
-; FALLBACK27-NEXT: popl %edi
-; FALLBACK27-NEXT: popl %ebx
-; FALLBACK27-NEXT: popl %ebp
-; FALLBACK27-NEXT: retl
-;
-; FALLBACK28-LABEL: shl_16bytes:
-; FALLBACK28: # %bb.0:
-; FALLBACK28-NEXT: pushl %ebp
-; FALLBACK28-NEXT: pushl %ebx
-; FALLBACK28-NEXT: pushl %edi
-; FALLBACK28-NEXT: pushl %esi
-; FALLBACK28-NEXT: subl $60, %esp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK28-NEXT: movzbl (%eax), %ecx
-; FALLBACK28-NEXT: movl %ecx, %eax
-; FALLBACK28-NEXT: shlb $3, %al
-; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: andb $12, %cl
-; FALLBACK28-NEXT: negb %cl
-; FALLBACK28-NEXT: movsbl %cl, %edi
-; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: movl %eax, %edx
-; FALLBACK28-NEXT: notb %dl
-; FALLBACK28-NEXT: movl 40(%esp,%edi), %ebp
-; FALLBACK28-NEXT: movl %ebp, %esi
-; FALLBACK28-NEXT: shrl %esi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %esi
-; FALLBACK28-NEXT: orl %ebx, %esi
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: movl 32(%esp,%edi), %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %edi
-; FALLBACK28-NEXT: shrl %edi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: orl %ebp, %edi
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK28-NEXT: shrl %ebp
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: orl %ebx, %ebp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: shll %cl, %eax
-; FALLBACK28-NEXT: movl %eax, (%edx)
-; FALLBACK28-NEXT: movl %ebp, 4(%edx)
-; FALLBACK28-NEXT: movl %edi, 8(%edx)
-; FALLBACK28-NEXT: movl %esi, 12(%edx)
-; FALLBACK28-NEXT: addl $60, %esp
-; FALLBACK28-NEXT: popl %esi
-; FALLBACK28-NEXT: popl %edi
-; FALLBACK28-NEXT: popl %ebx
-; FALLBACK28-NEXT: popl %ebp
-; FALLBACK28-NEXT: retl
-;
-; FALLBACK29-LABEL: shl_16bytes:
-; FALLBACK29: # %bb.0:
-; FALLBACK29-NEXT: pushl %ebp
-; FALLBACK29-NEXT: pushl %ebx
-; FALLBACK29-NEXT: pushl %edi
-; FALLBACK29-NEXT: pushl %esi
-; FALLBACK29-NEXT: subl $44, %esp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK29-NEXT: vmovups (%edx), %xmm0
-; FALLBACK29-NEXT: movzbl (%ecx), %edx
-; FALLBACK29-NEXT: movl %edx, %ecx
-; FALLBACK29-NEXT: shlb $3, %cl
-; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK29-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: andb $12, %dl
-; FALLBACK29-NEXT: negb %dl
-; FALLBACK29-NEXT: movsbl %dl, %edi
-; FALLBACK29-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK29-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK29-NEXT: shldl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK29-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK29-NEXT: shldl %cl, %edi, %esi
-; FALLBACK29-NEXT: movl %ebx, %ebp
-; FALLBACK29-NEXT: shll %cl, %ebp
-; FALLBACK29-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK29-NEXT: movl %edi, 4(%eax)
-; FALLBACK29-NEXT: movl %esi, 8(%eax)
-; FALLBACK29-NEXT: movl %edx, 12(%eax)
-; FALLBACK29-NEXT: movl %ebp, (%eax)
-; FALLBACK29-NEXT: addl $44, %esp
-; FALLBACK29-NEXT: popl %esi
-; FALLBACK29-NEXT: popl %edi
-; FALLBACK29-NEXT: popl %ebx
-; FALLBACK29-NEXT: popl %ebp
-; FALLBACK29-NEXT: retl
-;
-; FALLBACK30-LABEL: shl_16bytes:
-; FALLBACK30: # %bb.0:
-; FALLBACK30-NEXT: pushl %ebp
-; FALLBACK30-NEXT: pushl %ebx
-; FALLBACK30-NEXT: pushl %edi
-; FALLBACK30-NEXT: pushl %esi
-; FALLBACK30-NEXT: subl $44, %esp
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK30-NEXT: movzbl (%eax), %ecx
-; FALLBACK30-NEXT: movl %ecx, %eax
-; FALLBACK30-NEXT: shlb $3, %al
-; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK30-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: andb $12, %cl
-; FALLBACK30-NEXT: negb %cl
-; FALLBACK30-NEXT: movsbl %cl, %ecx
-; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi
-; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx
-; FALLBACK30-NEXT: shlxl %eax, %edx, %edi
-; FALLBACK30-NEXT: movl %eax, %ebx
-; FALLBACK30-NEXT: notb %bl
-; FALLBACK30-NEXT: shrl %edx
-; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx
-; FALLBACK30-NEXT: orl %esi, %edx
-; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi
-; FALLBACK30-NEXT: movl %esi, %ebp
-; FALLBACK30-NEXT: shrl %ebp
-; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp
-; FALLBACK30-NEXT: orl %edi, %ebp
-; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx
-; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax
-; FALLBACK30-NEXT: shrl %ecx
-; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT: orl %esi, %ecx
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK30-NEXT: movl %eax, (%esi)
-; FALLBACK30-NEXT: movl %ecx, 4(%esi)
-; FALLBACK30-NEXT: movl %ebp, 8(%esi)
-; FALLBACK30-NEXT: movl %edx, 12(%esi)
-; FALLBACK30-NEXT: addl $44, %esp
-; FALLBACK30-NEXT: popl %esi
-; FALLBACK30-NEXT: popl %edi
-; FALLBACK30-NEXT: popl %ebx
-; FALLBACK30-NEXT: popl %ebp
-; FALLBACK30-NEXT: retl
-;
-; FALLBACK31-LABEL: shl_16bytes:
-; FALLBACK31: # %bb.0:
-; FALLBACK31-NEXT: pushl %ebp
-; FALLBACK31-NEXT: pushl %ebx
-; FALLBACK31-NEXT: pushl %edi
-; FALLBACK31-NEXT: pushl %esi
-; FALLBACK31-NEXT: subl $44, %esp
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK31-NEXT: vmovups (%edx), %xmm0
-; FALLBACK31-NEXT: movzbl (%ecx), %edx
-; FALLBACK31-NEXT: movl %edx, %ecx
-; FALLBACK31-NEXT: shlb $3, %cl
-; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK31-NEXT: vmovaps %xmm1, (%esp)
-; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: andb $12, %dl
-; FALLBACK31-NEXT: negb %dl
-; FALLBACK31-NEXT: movsbl %dl, %edi
-; FALLBACK31-NEXT: movl 24(%esp,%edi), %esi
-; FALLBACK31-NEXT: movl 28(%esp,%edi), %edx
-; FALLBACK31-NEXT: shldl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl 16(%esp,%edi), %ebx
-; FALLBACK31-NEXT: movl 20(%esp,%edi), %edi
-; FALLBACK31-NEXT: shldl %cl, %edi, %esi
-; FALLBACK31-NEXT: shlxl %ecx, %ebx, %ebp
-; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK31-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK31-NEXT: movl %edi, 4(%eax)
-; FALLBACK31-NEXT: movl %esi, 8(%eax)
-; FALLBACK31-NEXT: movl %edx, 12(%eax)
-; FALLBACK31-NEXT: movl %ebp, (%eax)
-; FALLBACK31-NEXT: addl $44, %esp
-; FALLBACK31-NEXT: popl %esi
-; FALLBACK31-NEXT: popl %edi
-; FALLBACK31-NEXT: popl %ebx
-; FALLBACK31-NEXT: popl %ebp
-; FALLBACK31-NEXT: retl
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_16bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %dh
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %dh
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ah, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $32, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%edx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $12, %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ch, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $32, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_16bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %bl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, 28(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, (%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 4(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%edx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%edx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%edx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%edx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $12, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %al, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edi, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_16bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%edx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %dl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbl %dl, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_16bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %dl, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, 28(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%esp,%edx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%edx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %dl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %dl, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX-LABEL: shl_16bytes:
+; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $60, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $12, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movsbl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 4(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 8(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 12(%edx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $60, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%edx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, (%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %dl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbl %dl, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_16bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, (%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %dl, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, 28(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %esi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%esp,%edx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $44, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_16bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%edx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, (%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $12, %dl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %dl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %dl, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%esp,%edi), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%esp,%edi), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%esp,%edi), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%esp,%edi), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %ebx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $44, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl
%src = load i128, ptr %src.ptr, align 1
%byteOff = load i128, ptr %byteOff.ptr, align 1
%bitOff = shl i128 %byteOff, 3
@@ -2833,31 +2453,31 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %bl
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl
-; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebx,%ebx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx
-; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%edi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%edi), %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%esi,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, (%esp,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%edi), %edi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %eax, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %edi, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi)
-; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 12(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 8(%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, (%esi)
+; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%esi)
; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi
@@ -3108,1944 +2728,1477 @@ define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) no
}
define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: lshr_32bytes:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rcx
-; FALLBACK0-NEXT: movq 8(%rdi), %r8
-; FALLBACK0-NEXT: movq 16(%rdi), %r9
-; FALLBACK0-NEXT: movq 24(%rdi), %rdi
-; FALLBACK0-NEXT: movzbl (%rsi), %esi
-; FALLBACK0-NEXT: leal (,%rsi,8), %eax
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: andb $24, %sil
-; FALLBACK0-NEXT: movzbl %sil, %r9d
-; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx
-; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: orq %r11, %r8
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r10
-; FALLBACK0-NEXT: addq %rdi, %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: orq %rbx, %r10
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r9
-; FALLBACK0-NEXT: movq %r9, 24(%rdx)
-; FALLBACK0-NEXT: movq %r10, 16(%rdx)
-; FALLBACK0-NEXT: movq %rdi, (%rdx)
-; FALLBACK0-NEXT: movq %r8, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: lshr_32bytes:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK1-NEXT: xorps %xmm0, %xmm0
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: andb $24, %sil
-; FALLBACK1-NEXT: movzbl %sil, %eax
-; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi
-; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi
-; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8
-; FALLBACK1-NEXT: movq %r8, %r9
-; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax
-; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK1-NEXT: shrq %cl, %rax
-; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rax, 24(%rdx)
-; FALLBACK1-NEXT: movq %rdi, (%rdx)
-; FALLBACK1-NEXT: movq %r9, 8(%rdx)
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: lshr_32bytes:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %rdi
-; FALLBACK2-NEXT: movzbl (%rsi), %esi
-; FALLBACK2-NEXT: leal (,%rsi,8), %eax
-; FALLBACK2-NEXT: xorps %xmm0, %xmm0
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: andb $24, %sil
-; FALLBACK2-NEXT: movzbl %sil, %ecx
-; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi
-; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi
-; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
-; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9
-; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK2-NEXT: notb %al
-; FALLBACK2-NEXT: addq %rdi, %rdi
-; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r8, %rdi
-; FALLBACK2-NEXT: addq %rsi, %rsi
-; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r9, %rsi
-; FALLBACK2-NEXT: addq %rcx, %rcx
-; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
-; FALLBACK2-NEXT: orq %r10, %rax
-; FALLBACK2-NEXT: movq %r11, 24(%rdx)
-; FALLBACK2-NEXT: movq %rax, 16(%rdx)
-; FALLBACK2-NEXT: movq %rsi, (%rdx)
-; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: lshr_32bytes:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: movq (%rdi), %rax
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %rdi
-; FALLBACK3-NEXT: movzbl (%rsi), %esi
-; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK3-NEXT: xorps %xmm0, %xmm0
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: andb $24, %sil
-; FALLBACK3-NEXT: movzbl %sil, %eax
-; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi
-; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi
-; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8
-; FALLBACK3-NEXT: movq %r8, %r9
-; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax
-; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
-; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rax, 24(%rdx)
-; FALLBACK3-NEXT: movq %rdi, (%rdx)
-; FALLBACK3-NEXT: movq %r9, 8(%rdx)
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: lshr_32bytes:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: pushq %rbx
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK4-NEXT: movzbl (%rsi), %ecx
-; FALLBACK4-NEXT: leal (,%rcx,8), %eax
-; FALLBACK4-NEXT: xorps %xmm2, %xmm2
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: andb $24, %cl
-; FALLBACK4-NEXT: movzbl %cl, %r9d
-; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rdi
-; FALLBACK4-NEXT: orq %r10, %rdi
-; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10
-; FALLBACK4-NEXT: movq %r10, %r11
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r11
-; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rbx
-; FALLBACK4-NEXT: orq %r11, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r8
-; FALLBACK4-NEXT: addq %r10, %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: orq %r8, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r9
-; FALLBACK4-NEXT: movq %r9, 24(%rdx)
-; FALLBACK4-NEXT: movq %r10, 8(%rdx)
-; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK4-NEXT: movq %rdi, (%rdx)
-; FALLBACK4-NEXT: popq %rbx
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: lshr_32bytes:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK5-NEXT: movzbl (%rsi), %eax
-; FALLBACK5-NEXT: leal (,%rax,8), %ecx
-; FALLBACK5-NEXT: xorps %xmm2, %xmm2
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: andb $24, %al
-; FALLBACK5-NEXT: movzbl %al, %eax
-; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK5-NEXT: movq %rdi, %r8
-; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK5-NEXT: movq %rax, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK5-NEXT: shrq %cl, %rsi
-; FALLBACK5-NEXT: movq %r10, 8(%rdx)
-; FALLBACK5-NEXT: movq %r8, 16(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r9, (%rdx)
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: lshr_32bytes:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK6-NEXT: movzbl (%rsi), %ecx
-; FALLBACK6-NEXT: leal (,%rcx,8), %eax
-; FALLBACK6-NEXT: xorps %xmm2, %xmm2
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: andb $24, %cl
-; FALLBACK6-NEXT: movzbl %cl, %ecx
-; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi
-; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8
-; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK6-NEXT: notb %al
-; FALLBACK6-NEXT: addq %rdi, %rdi
-; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT: orq %rsi, %rdi
-; FALLBACK6-NEXT: addq %rcx, %rcx
-; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT: orq %r9, %rcx
-; FALLBACK6-NEXT: addq %r8, %r8
-; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK6-NEXT: orq %r10, %rax
-; FALLBACK6-NEXT: movq %r11, 24(%rdx)
-; FALLBACK6-NEXT: movq %rax, 8(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT: movq %rdi, (%rdx)
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: lshr_32bytes:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK7-NEXT: movzbl (%rsi), %eax
-; FALLBACK7-NEXT: leal (,%rax,8), %ecx
-; FALLBACK7-NEXT: xorps %xmm2, %xmm2
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: andb $24, %al
-; FALLBACK7-NEXT: movzbl %al, %eax
-; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK7-NEXT: movq %rdi, %r8
-; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK7-NEXT: movq %rax, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax
-; FALLBACK7-NEXT: movq %r10, 8(%rdx)
-; FALLBACK7-NEXT: movq %r8, 16(%rdx)
-; FALLBACK7-NEXT: movq %rax, 24(%rdx)
-; FALLBACK7-NEXT: movq %r9, (%rdx)
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: lshr_32bytes:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: pushq %rbx
-; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK8-NEXT: movzbl (%rsi), %ecx
-; FALLBACK8-NEXT: leal (,%rcx,8), %eax
-; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: andb $24, %cl
-; FALLBACK8-NEXT: movzbl %cl, %r9d
-; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rdi
-; FALLBACK8-NEXT: orq %r10, %rdi
-; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10
-; FALLBACK8-NEXT: movq %r10, %r11
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r11
-; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rbx
-; FALLBACK8-NEXT: orq %r11, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r8
-; FALLBACK8-NEXT: addq %r10, %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: orq %r8, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r9
-; FALLBACK8-NEXT: movq %r9, 24(%rdx)
-; FALLBACK8-NEXT: movq %r10, 8(%rdx)
-; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK8-NEXT: movq %rdi, (%rdx)
-; FALLBACK8-NEXT: popq %rbx
-; FALLBACK8-NEXT: vzeroupper
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: lshr_32bytes:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK9-NEXT: movzbl (%rsi), %eax
-; FALLBACK9-NEXT: leal (,%rax,8), %ecx
-; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: andb $24, %al
-; FALLBACK9-NEXT: movzbl %al, %eax
-; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK9-NEXT: movq %rdi, %r8
-; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK9-NEXT: movq %rax, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK9-NEXT: shrq %cl, %rsi
-; FALLBACK9-NEXT: movq %r10, 8(%rdx)
-; FALLBACK9-NEXT: movq %r8, 16(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r9, (%rdx)
-; FALLBACK9-NEXT: vzeroupper
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: lshr_32bytes:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT: movzbl (%rsi), %ecx
-; FALLBACK10-NEXT: leal (,%rcx,8), %eax
-; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: andb $24, %cl
-; FALLBACK10-NEXT: movzbl %cl, %ecx
-; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi
-; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8
-; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT: notb %al
-; FALLBACK10-NEXT: addq %rdi, %rdi
-; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT: orq %rsi, %rdi
-; FALLBACK10-NEXT: addq %rcx, %rcx
-; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT: orq %r9, %rcx
-; FALLBACK10-NEXT: addq %r8, %r8
-; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK10-NEXT: orq %r10, %rax
-; FALLBACK10-NEXT: movq %r11, 24(%rdx)
-; FALLBACK10-NEXT: movq %rax, 8(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT: movq %rdi, (%rdx)
-; FALLBACK10-NEXT: vzeroupper
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: lshr_32bytes:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK11-NEXT: movzbl (%rsi), %eax
-; FALLBACK11-NEXT: leal (,%rax,8), %ecx
-; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: andb $24, %al
-; FALLBACK11-NEXT: movzbl %al, %eax
-; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK11-NEXT: movq %rdi, %r8
-; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK11-NEXT: movq %rax, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax
-; FALLBACK11-NEXT: movq %r10, 8(%rdx)
-; FALLBACK11-NEXT: movq %r8, 16(%rdx)
-; FALLBACK11-NEXT: movq %rax, 24(%rdx)
-; FALLBACK11-NEXT: movq %r9, (%rdx)
-; FALLBACK11-NEXT: vzeroupper
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: lshr_32bytes:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: pushq %rbx
-; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK12-NEXT: movzbl (%rsi), %ecx
-; FALLBACK12-NEXT: leal (,%rcx,8), %eax
-; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: andb $24, %cl
-; FALLBACK12-NEXT: movzbl %cl, %r9d
-; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rdi
-; FALLBACK12-NEXT: orq %r10, %rdi
-; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10
-; FALLBACK12-NEXT: movq %r10, %r11
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r11
-; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rbx
-; FALLBACK12-NEXT: orq %r11, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r8
-; FALLBACK12-NEXT: addq %r10, %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: orq %r8, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r9
-; FALLBACK12-NEXT: movq %r9, 24(%rdx)
-; FALLBACK12-NEXT: movq %r10, 8(%rdx)
-; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK12-NEXT: movq %rdi, (%rdx)
-; FALLBACK12-NEXT: popq %rbx
-; FALLBACK12-NEXT: vzeroupper
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: lshr_32bytes:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK13-NEXT: movzbl (%rsi), %eax
-; FALLBACK13-NEXT: leal (,%rax,8), %ecx
-; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: andb $24, %al
-; FALLBACK13-NEXT: movzbl %al, %eax
-; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK13-NEXT: movq %rdi, %r8
-; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK13-NEXT: movq %rax, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK13-NEXT: shrq %cl, %rsi
-; FALLBACK13-NEXT: movq %r10, 8(%rdx)
-; FALLBACK13-NEXT: movq %r8, 16(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r9, (%rdx)
-; FALLBACK13-NEXT: vzeroupper
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: lshr_32bytes:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT: movzbl (%rsi), %ecx
-; FALLBACK14-NEXT: leal (,%rcx,8), %eax
-; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: andb $24, %cl
-; FALLBACK14-NEXT: movzbl %cl, %ecx
-; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi
-; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8
-; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT: notb %al
-; FALLBACK14-NEXT: addq %rdi, %rdi
-; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT: orq %rsi, %rdi
-; FALLBACK14-NEXT: addq %rcx, %rcx
-; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT: orq %r9, %rcx
-; FALLBACK14-NEXT: addq %r8, %r8
-; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK14-NEXT: orq %r10, %rax
-; FALLBACK14-NEXT: movq %r11, 24(%rdx)
-; FALLBACK14-NEXT: movq %rax, 8(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT: movq %rdi, (%rdx)
-; FALLBACK14-NEXT: vzeroupper
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: lshr_32bytes:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK15-NEXT: movzbl (%rsi), %eax
-; FALLBACK15-NEXT: leal (,%rax,8), %ecx
-; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: andb $24, %al
-; FALLBACK15-NEXT: movzbl %al, %eax
-; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK15-NEXT: movq %rdi, %r8
-; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK15-NEXT: movq %rax, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax
-; FALLBACK15-NEXT: movq %r10, 8(%rdx)
-; FALLBACK15-NEXT: movq %r8, 16(%rdx)
-; FALLBACK15-NEXT: movq %rax, 24(%rdx)
-; FALLBACK15-NEXT: movq %r9, (%rdx)
-; FALLBACK15-NEXT: vzeroupper
-; FALLBACK15-NEXT: retq
-;
-; FALLBACK16-LABEL: lshr_32bytes:
-; FALLBACK16: # %bb.0:
-; FALLBACK16-NEXT: pushl %ebp
-; FALLBACK16-NEXT: pushl %ebx
-; FALLBACK16-NEXT: pushl %edi
-; FALLBACK16-NEXT: pushl %esi
-; FALLBACK16-NEXT: subl $108, %esp
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK16-NEXT: movl (%ebp), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 4(%ebp), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 8(%ebp), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 12(%ebp), %edi
-; FALLBACK16-NEXT: movl 16(%ebp), %ebx
-; FALLBACK16-NEXT: movb (%eax), %ah
-; FALLBACK16-NEXT: movl 20(%ebp), %esi
-; FALLBACK16-NEXT: movl 24(%ebp), %ecx
-; FALLBACK16-NEXT: movl 28(%ebp), %ebp
-; FALLBACK16-NEXT: xorps %xmm0, %xmm0
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movb %ah, %dh
-; FALLBACK16-NEXT: shlb $3, %dh
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: andb $28, %ah
-; FALLBACK16-NEXT: movzbl %ah, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi
-; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax
-; FALLBACK16-NEXT: movl %eax, %ebx
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: movb %dh, %dl
-; FALLBACK16-NEXT: notb %dl
-; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi
-; FALLBACK16-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %ebx, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: movl %eax, %ebx
-; FALLBACK16-NEXT: addl %eax, %ebx
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %esi, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp
-; FALLBACK16-NEXT: movl %ebp, %esi
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: movl 48(%esp,%eax), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: leal (%eax,%eax), %ebx
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %esi, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: addl %ebp, %ebp
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %edi, %ebp
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl 52(%esp,%eax), %edi
-; FALLBACK16-NEXT: movl %edi, %ebx
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: movl 56(%esp,%eax), %esi
-; FALLBACK16-NEXT: leal (%esi,%esi), %eax
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %ebx, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: addl %edi, %edi
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: orl %ebx, %edi
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: movl %esi, %eax
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl 60(%esp,%ecx), %ebx
-; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: orl %eax, %esi
-; FALLBACK16-NEXT: movb %dh, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl %ebx, 28(%eax)
-; FALLBACK16-NEXT: movl %esi, 24(%eax)
-; FALLBACK16-NEXT: movl %edi, 16(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 20(%eax)
-; FALLBACK16-NEXT: movl %ebp, 8(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 12(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, (%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 4(%eax)
-; FALLBACK16-NEXT: addl $108, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: lshr_32bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebp
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $92, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK17-NEXT: movl (%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 8(%ebp), %esi
-; FALLBACK17-NEXT: movl 12(%ebp), %edi
-; FALLBACK17-NEXT: movl 16(%ebp), %ebx
-; FALLBACK17-NEXT: movb (%ecx), %ch
-; FALLBACK17-NEXT: movl 20(%ebp), %edx
-; FALLBACK17-NEXT: movl 24(%ebp), %eax
-; FALLBACK17-NEXT: movl 28(%ebp), %ebp
-; FALLBACK17-NEXT: xorps %xmm0, %xmm0
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movb %ch, %cl
-; FALLBACK17-NEXT: shlb $3, %cl
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: andb $28, %ch
-; FALLBACK17-NEXT: movzbl %ch, %ebp
-; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx
-; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %esi
-; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
-; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edi
-; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK17-NEXT: movl %edx, 24(%ebp)
-; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK17-NEXT: shrdl %cl, %edx, %esi
-; FALLBACK17-NEXT: shrl %cl, %eax
-; FALLBACK17-NEXT: movl %eax, 28(%ebp)
-; FALLBACK17-NEXT: movl %ebx, 16(%ebp)
-; FALLBACK17-NEXT: movl %edi, 20(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 8(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 12(%ebp)
-; FALLBACK17-NEXT: movl %esi, (%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 4(%ebp)
-; FALLBACK17-NEXT: addl $92, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: popl %ebp
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: lshr_32bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $108, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl (%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%eax), %esi
-; FALLBACK18-NEXT: movl 12(%eax), %edi
-; FALLBACK18-NEXT: movl 16(%eax), %ebp
-; FALLBACK18-NEXT: movzbl (%ebx), %ebx
-; FALLBACK18-NEXT: movl 20(%eax), %edx
-; FALLBACK18-NEXT: movl 24(%eax), %ecx
-; FALLBACK18-NEXT: movl 28(%eax), %eax
-; FALLBACK18-NEXT: xorps %xmm0, %xmm0
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebx, %eax
-; FALLBACK18-NEXT: shlb $3, %al
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: andb $28, %bl
-; FALLBACK18-NEXT: movzbl %bl, %edi
-; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %eax, %esi, %edx
-; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl %eax, %edx
-; FALLBACK18-NEXT: movl %eax, %ebx
-; FALLBACK18-NEXT: notb %dl
-; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp
-; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
-; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl %ebx, %ecx
-; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %eax
-; FALLBACK18-NEXT: orl %ebx, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %ebx
-; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi
-; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp
-; FALLBACK18-NEXT: movl %ecx, %eax
-; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx
-; FALLBACK18-NEXT: orl %ebx, %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %eax, %ebx
-; FALLBACK18-NEXT: addl %ebp, %ebp
-; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
-; FALLBACK18-NEXT: orl %ecx, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp
-; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax
-; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi
-; FALLBACK18-NEXT: orl %esi, %ecx
-; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: addl %eax, %eax
-; FALLBACK18-NEXT: shlxl %edx, %eax, %esi
-; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax
-; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %edx, %edi, %edi
-; FALLBACK18-NEXT: orl %eax, %edi
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl %ebx, 28(%eax)
-; FALLBACK18-NEXT: movl %edi, 24(%eax)
-; FALLBACK18-NEXT: movl %esi, 16(%eax)
-; FALLBACK18-NEXT: movl %ecx, 20(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 8(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 12(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, (%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 4(%eax)
-; FALLBACK18-NEXT: addl $108, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: lshr_32bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $92, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl (%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 4(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%ecx), %esi
-; FALLBACK19-NEXT: movl 12(%ecx), %edi
-; FALLBACK19-NEXT: movl 16(%ecx), %ebp
-; FALLBACK19-NEXT: movzbl (%ebx), %ebx
-; FALLBACK19-NEXT: movl 20(%ecx), %edx
-; FALLBACK19-NEXT: movl 24(%ecx), %eax
-; FALLBACK19-NEXT: movl 28(%ecx), %ecx
-; FALLBACK19-NEXT: xorps %xmm0, %xmm0
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, %ecx
-; FALLBACK19-NEXT: shlb $3, %cl
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: andb $28, %bl
-; FALLBACK19-NEXT: movzbl %bl, %ebp
-; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi
-; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %esi, %eax
-; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx
-; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx
-; FALLBACK19-NEXT: movl %edx, %esi
-; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx
-; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi
-; FALLBACK19-NEXT: shrdl %cl, %edi, %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl %eax, 24(%ebp)
-; FALLBACK19-NEXT: shrxl %ecx, %edi, %eax
-; FALLBACK19-NEXT: movl %eax, 28(%ebp)
-; FALLBACK19-NEXT: movl %ebx, 16(%ebp)
-; FALLBACK19-NEXT: movl %esi, 20(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 8(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 12(%ebp)
-; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK19-NEXT: movl %edx, (%ebp)
-; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 4(%ebp)
-; FALLBACK19-NEXT: addl $92, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: lshr_32bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $108, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK20-NEXT: movzbl (%eax), %ecx
-; FALLBACK20-NEXT: movl %ecx, %eax
-; FALLBACK20-NEXT: shlb $3, %al
-; FALLBACK20-NEXT: xorps %xmm2, %xmm2
-; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: andb $28, %cl
-; FALLBACK20-NEXT: movzbl %cl, %edi
-; FALLBACK20-NEXT: movl 32(%esp,%edi), %esi
-; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %esi
-; FALLBACK20-NEXT: movl %eax, %edx
-; FALLBACK20-NEXT: notb %dl
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %esi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebp
-; FALLBACK20-NEXT: movl %ebp, %esi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %esi
-; FALLBACK20-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %esi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi
-; FALLBACK20-NEXT: movl %esi, %ebx
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: addl %ebp, %ebp
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: orl %ebx, %ebp
-; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp
-; FALLBACK20-NEXT: movl %ebp, %ebx
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx
-; FALLBACK20-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: orl %ebx, %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %ebp, %ebp
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: orl %edi, %ebp
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl 60(%esp,%ecx), %ebx
-; FALLBACK20-NEXT: leal (%ebx,%ebx), %edi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: orl (%esp), %edi # 4-byte Folded Reload
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; FALLBACK20-NEXT: addl %esi, %esi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl %ebx, 28(%eax)
-; FALLBACK20-NEXT: movl %esi, 4(%eax)
-; FALLBACK20-NEXT: movl %edi, 24(%eax)
-; FALLBACK20-NEXT: movl %ebp, 16(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 20(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 8(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 12(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, (%eax)
-; FALLBACK20-NEXT: addl $108, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: lshr_32bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $108, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movups (%ecx), %xmm0
-; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK21-NEXT: movzbl (%eax), %eax
-; FALLBACK21-NEXT: movl %eax, %ecx
-; FALLBACK21-NEXT: shlb $3, %cl
-; FALLBACK21-NEXT: xorps %xmm2, %xmm2
-; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: andb $28, %al
-; FALLBACK21-NEXT: movzbl %al, %ebp
-; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx
-; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi
-; FALLBACK21-NEXT: movl %edi, %esi
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK21-NEXT: movl %esi, 4(%ebp)
-; FALLBACK21-NEXT: movl %ebx, 24(%ebp)
-; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK21-NEXT: shrl %cl, %eax
-; FALLBACK21-NEXT: movl %eax, 28(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 16(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 20(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 8(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 12(%ebp)
-; FALLBACK21-NEXT: movl %edx, (%ebp)
-; FALLBACK21-NEXT: addl $108, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: lshr_32bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $108, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK22-NEXT: movzbl (%eax), %ecx
-; FALLBACK22-NEXT: movl %ecx, %edx
-; FALLBACK22-NEXT: shlb $3, %dl
-; FALLBACK22-NEXT: xorps %xmm2, %xmm2
-; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: andb $28, %cl
-; FALLBACK22-NEXT: movzbl %cl, %edi
-; FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
-; FALLBACK22-NEXT: movl %edx, %eax
-; FALLBACK22-NEXT: notb %al
-; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK22-NEXT: orl %ecx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %ecx, %ecx
-; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi
-; FALLBACK22-NEXT: movl %eax, %ebp
-; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx
-; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx
-; FALLBACK22-NEXT: orl %ebx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %ecx, %ecx
-; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi
-; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx
-; FALLBACK22-NEXT: orl %ebx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi
-; FALLBACK22-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax
-; FALLBACK22-NEXT: movl %ebp, %ecx
-; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx
-; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %ebx, %ebx
-; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx
-; FALLBACK22-NEXT: orl %ebp, %ebx
-; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %eax
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: movl %ecx, %edx
-; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi
-; FALLBACK22-NEXT: orl %ebp, %edi
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: addl %ecx, %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK22-NEXT: orl %esi, %ecx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK22-NEXT: movl %eax, 28(%edx)
-; FALLBACK22-NEXT: movl %ecx, 4(%edx)
-; FALLBACK22-NEXT: movl %edi, 24(%edx)
-; FALLBACK22-NEXT: movl %ebx, 16(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 20(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 8(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 12(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, (%edx)
-; FALLBACK22-NEXT: addl $108, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: lshr_32bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $108, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movups (%ecx), %xmm0
-; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK23-NEXT: movzbl (%eax), %eax
-; FALLBACK23-NEXT: movl %eax, %ecx
-; FALLBACK23-NEXT: shlb $3, %cl
-; FALLBACK23-NEXT: xorps %xmm2, %xmm2
-; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: andb $28, %al
-; FALLBACK23-NEXT: movzbl %al, %ebx
-; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp
-; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax
-; FALLBACK23-NEXT: movl %eax, %edi
-; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi
-; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax
-; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp
-; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx
-; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx
-; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl %ebx, 4(%eax)
-; FALLBACK23-NEXT: movl %ebp, 24(%eax)
-; FALLBACK23-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; FALLBACK23-NEXT: movl %ebx, 28(%eax)
-; FALLBACK23-NEXT: movl %esi, 16(%eax)
-; FALLBACK23-NEXT: movl %edi, 20(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK23-NEXT: movl %esi, 8(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK23-NEXT: movl %esi, 12(%eax)
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, (%eax)
-; FALLBACK23-NEXT: addl $108, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: lshr_32bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $108, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT: movzbl (%eax), %ecx
-; FALLBACK24-NEXT: movl %ecx, %eax
-; FALLBACK24-NEXT: shlb $3, %al
-; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: andb $28, %cl
-; FALLBACK24-NEXT: movzbl %cl, %edi
-; FALLBACK24-NEXT: movl 32(%esp,%edi), %esi
-; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %esi
-; FALLBACK24-NEXT: movl %eax, %edx
-; FALLBACK24-NEXT: notb %dl
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %esi, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebp
-; FALLBACK24-NEXT: movl %ebp, %esi
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %esi
-; FALLBACK24-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %esi, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi
-; FALLBACK24-NEXT: movl %esi, %ebx
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: addl %ebp, %ebp
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: orl %ebx, %ebp
-; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp
-; FALLBACK24-NEXT: movl %ebp, %ebx
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx
-; FALLBACK24-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: orl %ebx, %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: addl %ebp, %ebp
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: orl %edi, %ebp
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl 60(%esp,%ecx), %ebx
-; FALLBACK24-NEXT: leal (%ebx,%ebx), %edi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: orl (%esp), %edi # 4-byte Folded Reload
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; FALLBACK24-NEXT: addl %esi, %esi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl %ebx, 28(%eax)
-; FALLBACK24-NEXT: movl %esi, 4(%eax)
-; FALLBACK24-NEXT: movl %edi, 24(%eax)
-; FALLBACK24-NEXT: movl %ebp, 16(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 20(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 8(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 12(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, (%eax)
-; FALLBACK24-NEXT: addl $108, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: vzeroupper
-; FALLBACK24-NEXT: retl
-;
-; FALLBACK25-LABEL: lshr_32bytes:
-; FALLBACK25: # %bb.0:
-; FALLBACK25-NEXT: pushl %ebp
-; FALLBACK25-NEXT: pushl %ebx
-; FALLBACK25-NEXT: pushl %edi
-; FALLBACK25-NEXT: pushl %esi
-; FALLBACK25-NEXT: subl $108, %esp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK25-NEXT: movzbl (%eax), %eax
-; FALLBACK25-NEXT: movl %eax, %ecx
-; FALLBACK25-NEXT: shlb $3, %cl
-; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: andb $28, %al
-; FALLBACK25-NEXT: movzbl %al, %ebp
-; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx
-; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi
-; FALLBACK25-NEXT: movl %edi, %esi
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK25-NEXT: movl %esi, 4(%ebp)
-; FALLBACK25-NEXT: movl %ebx, 24(%ebp)
-; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK25-NEXT: shrl %cl, %eax
-; FALLBACK25-NEXT: movl %eax, 28(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 16(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 20(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 8(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 12(%ebp)
-; FALLBACK25-NEXT: movl %edx, (%ebp)
-; FALLBACK25-NEXT: addl $108, %esp
-; FALLBACK25-NEXT: popl %esi
-; FALLBACK25-NEXT: popl %edi
-; FALLBACK25-NEXT: popl %ebx
-; FALLBACK25-NEXT: popl %ebp
-; FALLBACK25-NEXT: vzeroupper
-; FALLBACK25-NEXT: retl
-;
-; FALLBACK26-LABEL: lshr_32bytes:
-; FALLBACK26: # %bb.0:
-; FALLBACK26-NEXT: pushl %ebp
-; FALLBACK26-NEXT: pushl %ebx
-; FALLBACK26-NEXT: pushl %edi
-; FALLBACK26-NEXT: pushl %esi
-; FALLBACK26-NEXT: subl $108, %esp
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK26-NEXT: movzbl (%eax), %ecx
-; FALLBACK26-NEXT: movl %ecx, %edx
-; FALLBACK26-NEXT: shlb $3, %dl
-; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: andb $28, %cl
-; FALLBACK26-NEXT: movzbl %cl, %edi
-; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
-; FALLBACK26-NEXT: movl %edx, %eax
-; FALLBACK26-NEXT: notb %al
-; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %esi, %esi
-; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK26-NEXT: orl %ecx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi
-; FALLBACK26-NEXT: movl %eax, %ebp
-; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx
-; FALLBACK26-NEXT: shrxl %edx, %ecx, %ebx
-; FALLBACK26-NEXT: orl %ebx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi
-; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx
-; FALLBACK26-NEXT: orl %ebx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi
-; FALLBACK26-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax
-; FALLBACK26-NEXT: movl %ebp, %ecx
-; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx
-; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %ebx, %ebx
-; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx
-; FALLBACK26-NEXT: orl %ebp, %ebx
-; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %eax
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: movl %ecx, %edx
-; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi
-; FALLBACK26-NEXT: orl %ebp, %edi
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK26-NEXT: orl %esi, %ecx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK26-NEXT: movl %eax, 28(%edx)
-; FALLBACK26-NEXT: movl %ecx, 4(%edx)
-; FALLBACK26-NEXT: movl %edi, 24(%edx)
-; FALLBACK26-NEXT: movl %ebx, 16(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 20(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 8(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 12(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, (%edx)
-; FALLBACK26-NEXT: addl $108, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: vzeroupper
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: lshr_32bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $108, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK27-NEXT: movzbl (%eax), %eax
-; FALLBACK27-NEXT: movl %eax, %ecx
-; FALLBACK27-NEXT: shlb $3, %cl
-; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: andb $28, %al
-; FALLBACK27-NEXT: movzbl %al, %ebx
-; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp
-; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, %edi
-; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi
-; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp
-; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx
-; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx
-; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl %ebx, 4(%eax)
-; FALLBACK27-NEXT: movl %ebp, 24(%eax)
-; FALLBACK27-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; FALLBACK27-NEXT: movl %ebx, 28(%eax)
-; FALLBACK27-NEXT: movl %esi, 16(%eax)
-; FALLBACK27-NEXT: movl %edi, 20(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK27-NEXT: movl %esi, 8(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK27-NEXT: movl %esi, 12(%eax)
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, (%eax)
-; FALLBACK27-NEXT: addl $108, %esp
-; FALLBACK27-NEXT: popl %esi
-; FALLBACK27-NEXT: popl %edi
-; FALLBACK27-NEXT: popl %ebx
-; FALLBACK27-NEXT: popl %ebp
-; FALLBACK27-NEXT: vzeroupper
-; FALLBACK27-NEXT: retl
-;
-; FALLBACK28-LABEL: lshr_32bytes:
-; FALLBACK28: # %bb.0:
-; FALLBACK28-NEXT: pushl %ebp
-; FALLBACK28-NEXT: pushl %ebx
-; FALLBACK28-NEXT: pushl %edi
-; FALLBACK28-NEXT: pushl %esi
-; FALLBACK28-NEXT: subl $108, %esp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK28-NEXT: movzbl (%eax), %ecx
-; FALLBACK28-NEXT: movl %ecx, %eax
-; FALLBACK28-NEXT: shlb $3, %al
-; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: andb $28, %cl
-; FALLBACK28-NEXT: movzbl %cl, %edi
-; FALLBACK28-NEXT: movl 32(%esp,%edi), %esi
-; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %esi
-; FALLBACK28-NEXT: movl %eax, %edx
-; FALLBACK28-NEXT: notb %dl
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %esi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebp
-; FALLBACK28-NEXT: movl %ebp, %esi
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %esi
-; FALLBACK28-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %esi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi
-; FALLBACK28-NEXT: movl %esi, %ebx
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: addl %ebp, %ebp
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: orl %ebx, %ebp
-; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp
-; FALLBACK28-NEXT: movl %ebp, %ebx
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx
-; FALLBACK28-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; FALLBACK28-NEXT: leal (%ecx,%ecx), %edi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: orl %ebx, %edi
-; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: addl %ebp, %ebp
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: orl %edi, %ebp
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl 60(%esp,%ecx), %ebx
-; FALLBACK28-NEXT: leal (%ebx,%ebx), %edi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: orl (%esp), %edi # 4-byte Folded Reload
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; FALLBACK28-NEXT: addl %esi, %esi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl %ebx, 28(%eax)
-; FALLBACK28-NEXT: movl %esi, 4(%eax)
-; FALLBACK28-NEXT: movl %edi, 24(%eax)
-; FALLBACK28-NEXT: movl %ebp, 16(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 20(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 8(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 12(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, (%eax)
-; FALLBACK28-NEXT: addl $108, %esp
-; FALLBACK28-NEXT: popl %esi
-; FALLBACK28-NEXT: popl %edi
-; FALLBACK28-NEXT: popl %ebx
-; FALLBACK28-NEXT: popl %ebp
-; FALLBACK28-NEXT: vzeroupper
-; FALLBACK28-NEXT: retl
-;
-; FALLBACK29-LABEL: lshr_32bytes:
-; FALLBACK29: # %bb.0:
-; FALLBACK29-NEXT: pushl %ebp
-; FALLBACK29-NEXT: pushl %ebx
-; FALLBACK29-NEXT: pushl %edi
-; FALLBACK29-NEXT: pushl %esi
-; FALLBACK29-NEXT: subl $108, %esp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK29-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK29-NEXT: movzbl (%eax), %eax
-; FALLBACK29-NEXT: movl %eax, %ecx
-; FALLBACK29-NEXT: shlb $3, %cl
-; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: andb $28, %al
-; FALLBACK29-NEXT: movzbl %al, %ebp
-; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx
-; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi
-; FALLBACK29-NEXT: movl %edi, %esi
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK29-NEXT: movl %esi, 4(%ebp)
-; FALLBACK29-NEXT: movl %ebx, 24(%ebp)
-; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK29-NEXT: shrl %cl, %eax
-; FALLBACK29-NEXT: movl %eax, 28(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 16(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 20(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 8(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 12(%ebp)
-; FALLBACK29-NEXT: movl %edx, (%ebp)
-; FALLBACK29-NEXT: addl $108, %esp
-; FALLBACK29-NEXT: popl %esi
-; FALLBACK29-NEXT: popl %edi
-; FALLBACK29-NEXT: popl %ebx
-; FALLBACK29-NEXT: popl %ebp
-; FALLBACK29-NEXT: vzeroupper
-; FALLBACK29-NEXT: retl
-;
-; FALLBACK30-LABEL: lshr_32bytes:
-; FALLBACK30: # %bb.0:
-; FALLBACK30-NEXT: pushl %ebp
-; FALLBACK30-NEXT: pushl %ebx
-; FALLBACK30-NEXT: pushl %edi
-; FALLBACK30-NEXT: pushl %esi
-; FALLBACK30-NEXT: subl $108, %esp
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK30-NEXT: movzbl (%eax), %ecx
-; FALLBACK30-NEXT: movl %ecx, %edx
-; FALLBACK30-NEXT: shlb $3, %dl
-; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: andb $28, %cl
-; FALLBACK30-NEXT: movzbl %cl, %edi
-; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx
-; FALLBACK30-NEXT: movl %edx, %eax
-; FALLBACK30-NEXT: notb %al
-; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %esi, %esi
-; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK30-NEXT: orl %ecx, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %ecx, %ecx
-; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi
-; FALLBACK30-NEXT: movl %eax, %ebp
-; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx
-; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx
-; FALLBACK30-NEXT: orl %ebx, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %ecx, %ecx
-; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi
-; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx
-; FALLBACK30-NEXT: orl %ebx, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi
-; FALLBACK30-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax
-; FALLBACK30-NEXT: movl %ebp, %ecx
-; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx
-; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %ebx, %ebx
-; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx
-; FALLBACK30-NEXT: orl %ebp, %ebx
-; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK30-NEXT: shrxl %edx, %edi, %eax
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: movl %ecx, %edx
-; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi
-; FALLBACK30-NEXT: orl %ebp, %edi
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: addl %ecx, %ecx
-; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK30-NEXT: orl %esi, %ecx
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK30-NEXT: movl %eax, 28(%edx)
-; FALLBACK30-NEXT: movl %ecx, 4(%edx)
-; FALLBACK30-NEXT: movl %edi, 24(%edx)
-; FALLBACK30-NEXT: movl %ebx, 16(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 20(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 8(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 12(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, (%edx)
-; FALLBACK30-NEXT: addl $108, %esp
-; FALLBACK30-NEXT: popl %esi
-; FALLBACK30-NEXT: popl %edi
-; FALLBACK30-NEXT: popl %ebx
-; FALLBACK30-NEXT: popl %ebp
-; FALLBACK30-NEXT: vzeroupper
-; FALLBACK30-NEXT: retl
-;
-; FALLBACK31-LABEL: lshr_32bytes:
-; FALLBACK31: # %bb.0:
-; FALLBACK31-NEXT: pushl %ebp
-; FALLBACK31-NEXT: pushl %ebx
-; FALLBACK31-NEXT: pushl %edi
-; FALLBACK31-NEXT: pushl %esi
-; FALLBACK31-NEXT: subl $108, %esp
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK31-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK31-NEXT: movzbl (%eax), %eax
-; FALLBACK31-NEXT: movl %eax, %ecx
-; FALLBACK31-NEXT: shlb $3, %cl
-; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: andb $28, %al
-; FALLBACK31-NEXT: movzbl %al, %ebx
-; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp
-; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax
-; FALLBACK31-NEXT: movl %eax, %edi
-; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi
-; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax
-; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp
-; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx
-; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx
-; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl %ebx, 4(%eax)
-; FALLBACK31-NEXT: movl %ebp, 24(%eax)
-; FALLBACK31-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; FALLBACK31-NEXT: movl %ebx, 28(%eax)
-; FALLBACK31-NEXT: movl %esi, 16(%eax)
-; FALLBACK31-NEXT: movl %edi, 20(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK31-NEXT: movl %esi, 8(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK31-NEXT: movl %esi, 12(%eax)
-; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, (%eax)
-; FALLBACK31-NEXT: addl $108, %esp
-; FALLBACK31-NEXT: popl %esi
-; FALLBACK31-NEXT: popl %edi
-; FALLBACK31-NEXT: popl %ebx
-; FALLBACK31-NEXT: popl %ebp
-; FALLBACK31-NEXT: vzeroupper
-; FALLBACK31-NEXT: retl
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %cl
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %cl, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -72(%rsp,%rcx), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rcx), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rcx), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rcx), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rcx,%rcx), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rcx, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %r9d
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -72(%rsp,%rax), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ebp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %dh
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %dh
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ah, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%edi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%eax), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%eax), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%esi,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ebp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %ch, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %bl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%ebp,%ebp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, 32(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 20(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %bl, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, 32(%esp,%ebx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ebx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edi, %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ebx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edi, %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edi, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes:
+; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %al
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl (%esp), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebp, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, 32(%esp,%ebx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ebx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edi, %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ebx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edi, %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edi, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %al
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 32(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
%bitOff = shl i256 %byteOff, 3
@@ -5055,591 +4208,452 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
}
define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rcx
-; FALLBACK0-NEXT: movq 8(%rdi), %r8
-; FALLBACK0-NEXT: movq 16(%rdi), %r9
-; FALLBACK0-NEXT: movq 24(%rdi), %rdi
-; FALLBACK0-NEXT: movzbl (%rsi), %esi
-; FALLBACK0-NEXT: movl %esi, %eax
-; FALLBACK0-NEXT: shlb $5, %al
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: andb $6, %sil
-; FALLBACK0-NEXT: movzbl %sil, %r9d
-; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx
-; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: orq %r11, %r8
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r10
-; FALLBACK0-NEXT: addq %rdi, %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: orq %rbx, %r10
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r9
-; FALLBACK0-NEXT: movq %r9, 24(%rdx)
-; FALLBACK0-NEXT: movq %r10, 16(%rdx)
-; FALLBACK0-NEXT: movq %rdi, (%rdx)
-; FALLBACK0-NEXT: movq %r8, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: movl %esi, %ecx
-; FALLBACK1-NEXT: shlb $5, %cl
-; FALLBACK1-NEXT: xorps %xmm0, %xmm0
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: andb $6, %sil
-; FALLBACK1-NEXT: movzbl %sil, %eax
-; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi
-; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi
-; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8
-; FALLBACK1-NEXT: movq %r8, %r9
-; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax
-; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK1-NEXT: shrq %cl, %rax
-; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rax, 24(%rdx)
-; FALLBACK1-NEXT: movq %rdi, (%rdx)
-; FALLBACK1-NEXT: movq %r9, 8(%rdx)
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %rdi
-; FALLBACK2-NEXT: movzbl (%rsi), %esi
-; FALLBACK2-NEXT: movl %esi, %eax
-; FALLBACK2-NEXT: shlb $5, %al
-; FALLBACK2-NEXT: xorps %xmm0, %xmm0
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: andb $6, %sil
-; FALLBACK2-NEXT: movzbl %sil, %ecx
-; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi
-; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi
-; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
-; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9
-; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK2-NEXT: notb %al
-; FALLBACK2-NEXT: addq %rdi, %rdi
-; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r8, %rdi
-; FALLBACK2-NEXT: addq %rsi, %rsi
-; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r9, %rsi
-; FALLBACK2-NEXT: addq %rcx, %rcx
-; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
-; FALLBACK2-NEXT: orq %r10, %rax
-; FALLBACK2-NEXT: movq %r11, 24(%rdx)
-; FALLBACK2-NEXT: movq %rax, 16(%rdx)
-; FALLBACK2-NEXT: movq %rsi, (%rdx)
-; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: movq (%rdi), %rax
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %rdi
-; FALLBACK3-NEXT: movzbl (%rsi), %esi
-; FALLBACK3-NEXT: movl %esi, %ecx
-; FALLBACK3-NEXT: shlb $5, %cl
-; FALLBACK3-NEXT: xorps %xmm0, %xmm0
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: andb $6, %sil
-; FALLBACK3-NEXT: movzbl %sil, %eax
-; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi
-; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi
-; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8
-; FALLBACK3-NEXT: movq %r8, %r9
-; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax
-; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
-; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rax, 24(%rdx)
-; FALLBACK3-NEXT: movq %rdi, (%rdx)
-; FALLBACK3-NEXT: movq %r9, 8(%rdx)
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: pushq %rbx
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK4-NEXT: movzbl (%rsi), %ecx
-; FALLBACK4-NEXT: movl %ecx, %eax
-; FALLBACK4-NEXT: shlb $5, %al
-; FALLBACK4-NEXT: xorps %xmm2, %xmm2
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: andb $6, %cl
-; FALLBACK4-NEXT: movzbl %cl, %r9d
-; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rdi
-; FALLBACK4-NEXT: orq %r10, %rdi
-; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10
-; FALLBACK4-NEXT: movq %r10, %r11
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r11
-; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rbx
-; FALLBACK4-NEXT: orq %r11, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r8
-; FALLBACK4-NEXT: addq %r10, %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: orq %r8, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r9
-; FALLBACK4-NEXT: movq %r9, 24(%rdx)
-; FALLBACK4-NEXT: movq %r10, 8(%rdx)
-; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK4-NEXT: movq %rdi, (%rdx)
-; FALLBACK4-NEXT: popq %rbx
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK5-NEXT: movzbl (%rsi), %eax
-; FALLBACK5-NEXT: movl %eax, %ecx
-; FALLBACK5-NEXT: shlb $5, %cl
-; FALLBACK5-NEXT: xorps %xmm2, %xmm2
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: andb $6, %al
-; FALLBACK5-NEXT: movzbl %al, %eax
-; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK5-NEXT: movq %rdi, %r8
-; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK5-NEXT: movq %rax, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK5-NEXT: shrq %cl, %rsi
-; FALLBACK5-NEXT: movq %r10, 8(%rdx)
-; FALLBACK5-NEXT: movq %r8, 16(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r9, (%rdx)
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK6-NEXT: movzbl (%rsi), %ecx
-; FALLBACK6-NEXT: movl %ecx, %eax
-; FALLBACK6-NEXT: shlb $5, %al
-; FALLBACK6-NEXT: xorps %xmm2, %xmm2
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: andb $6, %cl
-; FALLBACK6-NEXT: movzbl %cl, %ecx
-; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8
-; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK6-NEXT: notb %al
-; FALLBACK6-NEXT: addq %rdi, %rdi
-; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT: orq %rsi, %rdi
-; FALLBACK6-NEXT: addq %rcx, %rcx
-; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT: orq %r9, %rcx
-; FALLBACK6-NEXT: addq %r8, %r8
-; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK6-NEXT: orq %r10, %rax
-; FALLBACK6-NEXT: movq %r11, 24(%rdx)
-; FALLBACK6-NEXT: movq %rax, 8(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT: movq %rdi, (%rdx)
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK7-NEXT: movzbl (%rsi), %eax
-; FALLBACK7-NEXT: movl %eax, %ecx
-; FALLBACK7-NEXT: shlb $5, %cl
-; FALLBACK7-NEXT: xorps %xmm2, %xmm2
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: andb $6, %al
-; FALLBACK7-NEXT: movzbl %al, %eax
-; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK7-NEXT: movq %rdi, %r8
-; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK7-NEXT: movq %rax, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax
-; FALLBACK7-NEXT: movq %r10, 8(%rdx)
-; FALLBACK7-NEXT: movq %r8, 16(%rdx)
-; FALLBACK7-NEXT: movq %rax, 24(%rdx)
-; FALLBACK7-NEXT: movq %r9, (%rdx)
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: pushq %rbx
-; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK8-NEXT: movzbl (%rsi), %ecx
-; FALLBACK8-NEXT: movl %ecx, %eax
-; FALLBACK8-NEXT: shlb $5, %al
-; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: andb $6, %cl
-; FALLBACK8-NEXT: movzbl %cl, %r9d
-; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rdi
-; FALLBACK8-NEXT: orq %r10, %rdi
-; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10
-; FALLBACK8-NEXT: movq %r10, %r11
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r11
-; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rbx
-; FALLBACK8-NEXT: orq %r11, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r8
-; FALLBACK8-NEXT: addq %r10, %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: orq %r8, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r9
-; FALLBACK8-NEXT: movq %r9, 24(%rdx)
-; FALLBACK8-NEXT: movq %r10, 8(%rdx)
-; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK8-NEXT: movq %rdi, (%rdx)
-; FALLBACK8-NEXT: popq %rbx
-; FALLBACK8-NEXT: vzeroupper
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK9-NEXT: movzbl (%rsi), %eax
-; FALLBACK9-NEXT: movl %eax, %ecx
-; FALLBACK9-NEXT: shlb $5, %cl
-; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: andb $6, %al
-; FALLBACK9-NEXT: movzbl %al, %eax
-; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK9-NEXT: movq %rdi, %r8
-; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK9-NEXT: movq %rax, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK9-NEXT: shrq %cl, %rsi
-; FALLBACK9-NEXT: movq %r10, 8(%rdx)
-; FALLBACK9-NEXT: movq %r8, 16(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r9, (%rdx)
-; FALLBACK9-NEXT: vzeroupper
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT: movzbl (%rsi), %ecx
-; FALLBACK10-NEXT: movl %ecx, %eax
-; FALLBACK10-NEXT: shlb $5, %al
-; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: andb $6, %cl
-; FALLBACK10-NEXT: movzbl %cl, %ecx
-; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8
-; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT: notb %al
-; FALLBACK10-NEXT: addq %rdi, %rdi
-; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT: orq %rsi, %rdi
-; FALLBACK10-NEXT: addq %rcx, %rcx
-; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT: orq %r9, %rcx
-; FALLBACK10-NEXT: addq %r8, %r8
-; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK10-NEXT: orq %r10, %rax
-; FALLBACK10-NEXT: movq %r11, 24(%rdx)
-; FALLBACK10-NEXT: movq %rax, 8(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT: movq %rdi, (%rdx)
-; FALLBACK10-NEXT: vzeroupper
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK11-NEXT: movzbl (%rsi), %eax
-; FALLBACK11-NEXT: movl %eax, %ecx
-; FALLBACK11-NEXT: shlb $5, %cl
-; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: andb $6, %al
-; FALLBACK11-NEXT: movzbl %al, %eax
-; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK11-NEXT: movq %rdi, %r8
-; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK11-NEXT: movq %rax, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax
-; FALLBACK11-NEXT: movq %r10, 8(%rdx)
-; FALLBACK11-NEXT: movq %r8, 16(%rdx)
-; FALLBACK11-NEXT: movq %rax, 24(%rdx)
-; FALLBACK11-NEXT: movq %r9, (%rdx)
-; FALLBACK11-NEXT: vzeroupper
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: pushq %rbx
-; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK12-NEXT: movzbl (%rsi), %ecx
-; FALLBACK12-NEXT: movl %ecx, %eax
-; FALLBACK12-NEXT: shlb $5, %al
-; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: andb $6, %cl
-; FALLBACK12-NEXT: movzbl %cl, %r9d
-; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rdi
-; FALLBACK12-NEXT: orq %r10, %rdi
-; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10
-; FALLBACK12-NEXT: movq %r10, %r11
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r11
-; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rbx
-; FALLBACK12-NEXT: orq %r11, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r8
-; FALLBACK12-NEXT: addq %r10, %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: orq %r8, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r9
-; FALLBACK12-NEXT: movq %r9, 24(%rdx)
-; FALLBACK12-NEXT: movq %r10, 8(%rdx)
-; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK12-NEXT: movq %rdi, (%rdx)
-; FALLBACK12-NEXT: popq %rbx
-; FALLBACK12-NEXT: vzeroupper
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK13-NEXT: movzbl (%rsi), %eax
-; FALLBACK13-NEXT: movl %eax, %ecx
-; FALLBACK13-NEXT: shlb $5, %cl
-; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: andb $6, %al
-; FALLBACK13-NEXT: movzbl %al, %eax
-; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK13-NEXT: movq %rdi, %r8
-; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK13-NEXT: movq %rax, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK13-NEXT: shrq %cl, %rsi
-; FALLBACK13-NEXT: movq %r10, 8(%rdx)
-; FALLBACK13-NEXT: movq %r8, 16(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r9, (%rdx)
-; FALLBACK13-NEXT: vzeroupper
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT: movzbl (%rsi), %ecx
-; FALLBACK14-NEXT: movl %ecx, %eax
-; FALLBACK14-NEXT: shlb $5, %al
-; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: andb $6, %cl
-; FALLBACK14-NEXT: movzbl %cl, %ecx
-; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8
-; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11
-; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT: notb %al
-; FALLBACK14-NEXT: addq %rdi, %rdi
-; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT: orq %rsi, %rdi
-; FALLBACK14-NEXT: addq %rcx, %rcx
-; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT: orq %r9, %rcx
-; FALLBACK14-NEXT: addq %r8, %r8
-; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK14-NEXT: orq %r10, %rax
-; FALLBACK14-NEXT: movq %r11, 24(%rdx)
-; FALLBACK14-NEXT: movq %rax, 8(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT: movq %rdi, (%rdx)
-; FALLBACK14-NEXT: vzeroupper
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: lshr_32bytes_dwordOff:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK15-NEXT: movzbl (%rsi), %eax
-; FALLBACK15-NEXT: movl %eax, %ecx
-; FALLBACK15-NEXT: shlb $5, %cl
-; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: andb $6, %al
-; FALLBACK15-NEXT: movzbl %al, %eax
-; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK15-NEXT: movq %rdi, %r8
-; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK15-NEXT: movq %rax, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax
-; FALLBACK15-NEXT: movq %r10, 8(%rdx)
-; FALLBACK15-NEXT: movq %r8, 16(%rdx)
-; FALLBACK15-NEXT: movq %rax, 24(%rdx)
-; FALLBACK15-NEXT: movq %r9, (%rdx)
-; FALLBACK15-NEXT: vzeroupper
-; FALLBACK15-NEXT: retq
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9,4), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9,4), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %cl
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %cl, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %al
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %cl
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %cl, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -72(%rsp,%rcx,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rcx,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rcx,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rcx,4), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rcx,%rcx), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rcx, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %al
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $6, %cl
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %cl, %r9d
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $6, %al
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -72(%rsp,%rax,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: lshr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %al
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X86-SSE2-LABEL: lshr_32bytes_dwordOff:
; X86-SSE2: # %bb.0:
@@ -5922,1955 +4936,1495 @@ define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
}
define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: shl_32bytes:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rcx
-; FALLBACK0-NEXT: movq 8(%rdi), %r8
-; FALLBACK0-NEXT: movq 16(%rdi), %r9
-; FALLBACK0-NEXT: movq 24(%rdi), %rdi
-; FALLBACK0-NEXT: movzbl (%rsi), %esi
-; FALLBACK0-NEXT: leal (,%rsi,8), %eax
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: andb $24, %sil
-; FALLBACK0-NEXT: negb %sil
-; FALLBACK0-NEXT: movsbq %sil, %r10
-; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8
-; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq %r8, %r9
-; FALLBACK0-NEXT: shrq %r9
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r9
-; FALLBACK0-NEXT: orq %r11, %r9
-; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r11
-; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10
-; FALLBACK0-NEXT: movq %r10, %rbx
-; FALLBACK0-NEXT: shrq %rbx
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: orq %r11, %rbx
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: shrq %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: movq %r8, (%rdx)
-; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK0-NEXT: movq %rbx, 24(%rdx)
-; FALLBACK0-NEXT: movq %r9, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: shl_32bytes:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK1-NEXT: xorps %xmm0, %xmm0
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: andb $24, %sil
-; FALLBACK1-NEXT: negb %sil
-; FALLBACK1-NEXT: movsbq %sil, %rax
-; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK1-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK1-NEXT: shldq %cl, %r8, %rax
-; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK1-NEXT: shlq %cl, %r8
-; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK1-NEXT: movq %r8, (%rdx)
-; FALLBACK1-NEXT: movq %rax, 8(%rdx)
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: shl_32bytes:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %rdi
-; FALLBACK2-NEXT: movzbl (%rsi), %esi
-; FALLBACK2-NEXT: leal (,%rsi,8), %eax
-; FALLBACK2-NEXT: xorps %xmm0, %xmm0
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: andb $24, %sil
-; FALLBACK2-NEXT: negb %sil
-; FALLBACK2-NEXT: movsbq %sil, %rsi
-; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi
-; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx
-; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8
-; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9
-; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi
-; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10
-; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11
-; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK2-NEXT: notb %al
-; FALLBACK2-NEXT: shrq %rdi
-; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r8, %rdi
-; FALLBACK2-NEXT: shrq %rsi
-; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r9, %rsi
-; FALLBACK2-NEXT: shrq %rcx
-; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax
-; FALLBACK2-NEXT: orq %r10, %rax
-; FALLBACK2-NEXT: movq %r11, (%rdx)
-; FALLBACK2-NEXT: movq %rax, 16(%rdx)
-; FALLBACK2-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: shl_32bytes:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: movq (%rdi), %rax
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %rdi
-; FALLBACK3-NEXT: movzbl (%rsi), %esi
-; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK3-NEXT: xorps %xmm0, %xmm0
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: andb $24, %sil
-; FALLBACK3-NEXT: negb %sil
-; FALLBACK3-NEXT: movsbq %sil, %rax
-; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK3-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK3-NEXT: shldq %cl, %r8, %rax
-; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx
-; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK3-NEXT: movq %rcx, (%rdx)
-; FALLBACK3-NEXT: movq %rax, 8(%rdx)
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: shl_32bytes:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK4-NEXT: movzbl (%rsi), %ecx
-; FALLBACK4-NEXT: leal (,%rcx,8), %eax
-; FALLBACK4-NEXT: xorps %xmm2, %xmm2
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: andb $24, %cl
-; FALLBACK4-NEXT: negb %cl
-; FALLBACK4-NEXT: movsbq %cl, %r8
-; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r9
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10
-; FALLBACK4-NEXT: movq %r10, %rdi
-; FALLBACK4-NEXT: shrq %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %rdi
-; FALLBACK4-NEXT: orq %r9, %rdi
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9
-; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8
-; FALLBACK4-NEXT: movq %r8, %r11
-; FALLBACK4-NEXT: shrq %r11
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r11
-; FALLBACK4-NEXT: orq %r10, %r11
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r8
-; FALLBACK4-NEXT: movq %r9, %r10
-; FALLBACK4-NEXT: shrq %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: orq %r8, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r9
-; FALLBACK4-NEXT: movq %r9, (%rdx)
-; FALLBACK4-NEXT: movq %r10, 8(%rdx)
-; FALLBACK4-NEXT: movq %r11, 16(%rdx)
-; FALLBACK4-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: shl_32bytes:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK5-NEXT: movzbl (%rsi), %eax
-; FALLBACK5-NEXT: leal (,%rax,8), %ecx
-; FALLBACK5-NEXT: xorps %xmm2, %xmm2
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: andb $24, %al
-; FALLBACK5-NEXT: negb %al
-; FALLBACK5-NEXT: movsbq %al, %rax
-; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK5-NEXT: movq %r8, %r9
-; FALLBACK5-NEXT: shlq %cl, %r9
-; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK5-NEXT: shldq %cl, %r8, %rax
-; FALLBACK5-NEXT: movq %rax, 8(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r9, (%rdx)
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: shl_32bytes:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK6-NEXT: movzbl (%rsi), %ecx
-; FALLBACK6-NEXT: leal (,%rcx,8), %eax
-; FALLBACK6-NEXT: xorps %xmm2, %xmm2
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: andb $24, %cl
-; FALLBACK6-NEXT: negb %cl
-; FALLBACK6-NEXT: movsbq %cl, %rcx
-; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi
-; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8
-; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9
-; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx
-; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10
-; FALLBACK6-NEXT: shlxq %rax, %r9, %r11
-; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK6-NEXT: notb %al
-; FALLBACK6-NEXT: shrq %rdi
-; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT: orq %rsi, %rdi
-; FALLBACK6-NEXT: shrq %rcx
-; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT: orq %r8, %rcx
-; FALLBACK6-NEXT: shrq %r9
-; FALLBACK6-NEXT: shrxq %rax, %r9, %rax
-; FALLBACK6-NEXT: orq %r10, %rax
-; FALLBACK6-NEXT: movq %r11, (%rdx)
-; FALLBACK6-NEXT: movq %rax, 8(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: shl_32bytes:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK7-NEXT: movzbl (%rsi), %eax
-; FALLBACK7-NEXT: leal (,%rax,8), %ecx
-; FALLBACK7-NEXT: xorps %xmm2, %xmm2
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: andb $24, %al
-; FALLBACK7-NEXT: negb %al
-; FALLBACK7-NEXT: movsbq %al, %rax
-; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9
-; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK7-NEXT: shldq %cl, %r8, %rax
-; FALLBACK7-NEXT: movq %rax, 8(%rdx)
-; FALLBACK7-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK7-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK7-NEXT: movq %r9, (%rdx)
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: shl_32bytes:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK8-NEXT: movzbl (%rsi), %ecx
-; FALLBACK8-NEXT: leal (,%rcx,8), %eax
-; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: andb $24, %cl
-; FALLBACK8-NEXT: negb %cl
-; FALLBACK8-NEXT: movsbq %cl, %r8
-; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r9
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10
-; FALLBACK8-NEXT: movq %r10, %rdi
-; FALLBACK8-NEXT: shrq %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %rdi
-; FALLBACK8-NEXT: orq %r9, %rdi
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9
-; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8
-; FALLBACK8-NEXT: movq %r8, %r11
-; FALLBACK8-NEXT: shrq %r11
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r11
-; FALLBACK8-NEXT: orq %r10, %r11
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r8
-; FALLBACK8-NEXT: movq %r9, %r10
-; FALLBACK8-NEXT: shrq %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: orq %r8, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r9
-; FALLBACK8-NEXT: movq %r9, (%rdx)
-; FALLBACK8-NEXT: movq %r10, 8(%rdx)
-; FALLBACK8-NEXT: movq %r11, 16(%rdx)
-; FALLBACK8-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK8-NEXT: vzeroupper
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: shl_32bytes:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK9-NEXT: movzbl (%rsi), %eax
-; FALLBACK9-NEXT: leal (,%rax,8), %ecx
-; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: andb $24, %al
-; FALLBACK9-NEXT: negb %al
-; FALLBACK9-NEXT: movsbq %al, %rax
-; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK9-NEXT: movq %r8, %r9
-; FALLBACK9-NEXT: shlq %cl, %r9
-; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK9-NEXT: shldq %cl, %r8, %rax
-; FALLBACK9-NEXT: movq %rax, 8(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r9, (%rdx)
-; FALLBACK9-NEXT: vzeroupper
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: shl_32bytes:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT: movzbl (%rsi), %ecx
-; FALLBACK10-NEXT: leal (,%rcx,8), %eax
-; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: andb $24, %cl
-; FALLBACK10-NEXT: negb %cl
-; FALLBACK10-NEXT: movsbq %cl, %rcx
-; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi
-; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8
-; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9
-; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx
-; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10
-; FALLBACK10-NEXT: shlxq %rax, %r9, %r11
-; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT: notb %al
-; FALLBACK10-NEXT: shrq %rdi
-; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT: orq %rsi, %rdi
-; FALLBACK10-NEXT: shrq %rcx
-; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT: orq %r8, %rcx
-; FALLBACK10-NEXT: shrq %r9
-; FALLBACK10-NEXT: shrxq %rax, %r9, %rax
-; FALLBACK10-NEXT: orq %r10, %rax
-; FALLBACK10-NEXT: movq %r11, (%rdx)
-; FALLBACK10-NEXT: movq %rax, 8(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK10-NEXT: vzeroupper
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: shl_32bytes:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK11-NEXT: movzbl (%rsi), %eax
-; FALLBACK11-NEXT: leal (,%rax,8), %ecx
-; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: andb $24, %al
-; FALLBACK11-NEXT: negb %al
-; FALLBACK11-NEXT: movsbq %al, %rax
-; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9
-; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK11-NEXT: shldq %cl, %r8, %rax
-; FALLBACK11-NEXT: movq %rax, 8(%rdx)
-; FALLBACK11-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK11-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK11-NEXT: movq %r9, (%rdx)
-; FALLBACK11-NEXT: vzeroupper
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: shl_32bytes:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK12-NEXT: movzbl (%rsi), %ecx
-; FALLBACK12-NEXT: leal (,%rcx,8), %eax
-; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: andb $24, %cl
-; FALLBACK12-NEXT: negb %cl
-; FALLBACK12-NEXT: movsbq %cl, %r8
-; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r9
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10
-; FALLBACK12-NEXT: movq %r10, %rdi
-; FALLBACK12-NEXT: shrq %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %rdi
-; FALLBACK12-NEXT: orq %r9, %rdi
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9
-; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8
-; FALLBACK12-NEXT: movq %r8, %r11
-; FALLBACK12-NEXT: shrq %r11
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r11
-; FALLBACK12-NEXT: orq %r10, %r11
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r8
-; FALLBACK12-NEXT: movq %r9, %r10
-; FALLBACK12-NEXT: shrq %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: orq %r8, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r9
-; FALLBACK12-NEXT: movq %r9, (%rdx)
-; FALLBACK12-NEXT: movq %r10, 8(%rdx)
-; FALLBACK12-NEXT: movq %r11, 16(%rdx)
-; FALLBACK12-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK12-NEXT: vzeroupper
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: shl_32bytes:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK13-NEXT: movzbl (%rsi), %eax
-; FALLBACK13-NEXT: leal (,%rax,8), %ecx
-; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: andb $24, %al
-; FALLBACK13-NEXT: negb %al
-; FALLBACK13-NEXT: movsbq %al, %rax
-; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK13-NEXT: movq %r8, %r9
-; FALLBACK13-NEXT: shlq %cl, %r9
-; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK13-NEXT: shldq %cl, %r8, %rax
-; FALLBACK13-NEXT: movq %rax, 8(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r9, (%rdx)
-; FALLBACK13-NEXT: vzeroupper
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: shl_32bytes:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT: movzbl (%rsi), %ecx
-; FALLBACK14-NEXT: leal (,%rcx,8), %eax
-; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: andb $24, %cl
-; FALLBACK14-NEXT: negb %cl
-; FALLBACK14-NEXT: movsbq %cl, %rcx
-; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi
-; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8
-; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9
-; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx
-; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10
-; FALLBACK14-NEXT: shlxq %rax, %r9, %r11
-; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT: notb %al
-; FALLBACK14-NEXT: shrq %rdi
-; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT: orq %rsi, %rdi
-; FALLBACK14-NEXT: shrq %rcx
-; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT: orq %r8, %rcx
-; FALLBACK14-NEXT: shrq %r9
-; FALLBACK14-NEXT: shrxq %rax, %r9, %rax
-; FALLBACK14-NEXT: orq %r10, %rax
-; FALLBACK14-NEXT: movq %r11, (%rdx)
-; FALLBACK14-NEXT: movq %rax, 8(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK14-NEXT: vzeroupper
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: shl_32bytes:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK15-NEXT: movzbl (%rsi), %eax
-; FALLBACK15-NEXT: leal (,%rax,8), %ecx
-; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: andb $24, %al
-; FALLBACK15-NEXT: negb %al
-; FALLBACK15-NEXT: movsbq %al, %rax
-; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9
-; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK15-NEXT: shldq %cl, %r8, %rax
-; FALLBACK15-NEXT: movq %rax, 8(%rdx)
-; FALLBACK15-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK15-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK15-NEXT: movq %r9, (%rdx)
-; FALLBACK15-NEXT: vzeroupper
-; FALLBACK15-NEXT: retq
-;
-; FALLBACK16-LABEL: shl_32bytes:
-; FALLBACK16: # %bb.0:
-; FALLBACK16-NEXT: pushl %ebp
-; FALLBACK16-NEXT: pushl %ebx
-; FALLBACK16-NEXT: pushl %edi
-; FALLBACK16-NEXT: pushl %esi
-; FALLBACK16-NEXT: subl $108, %esp
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT: movl (%ecx), %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 4(%ecx), %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 8(%ecx), %esi
-; FALLBACK16-NEXT: movl 12(%ecx), %edi
-; FALLBACK16-NEXT: movl 16(%ecx), %ebx
-; FALLBACK16-NEXT: movb (%eax), %ah
-; FALLBACK16-NEXT: movl 20(%ecx), %ebp
-; FALLBACK16-NEXT: movl 24(%ecx), %edx
-; FALLBACK16-NEXT: movl 28(%ecx), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movb %ah, %ch
-; FALLBACK16-NEXT: shlb $3, %ch
-; FALLBACK16-NEXT: xorps %xmm0, %xmm0
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: andb $28, %ah
-; FALLBACK16-NEXT: negb %ah
-; FALLBACK16-NEXT: movsbl %ah, %ebx
-; FALLBACK16-NEXT: movl 64(%esp,%ebx), %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 68(%esp,%ebx), %eax
-; FALLBACK16-NEXT: movl %eax, %esi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: movb %ch, %dl
-; FALLBACK16-NEXT: notb %dl
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %esi, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 76(%esp,%ebx), %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: movl 72(%esp,%ebx), %esi
-; FALLBACK16-NEXT: movl %esi, %ebp
-; FALLBACK16-NEXT: shrl %ebp
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: orl %edi, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: shrl %eax
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: orl %esi, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 84(%esp,%ebx), %esi
-; FALLBACK16-NEXT: movl %esi, %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: movl 80(%esp,%ebx), %edi
-; FALLBACK16-NEXT: movl %edi, %ebp
-; FALLBACK16-NEXT: shrl %ebp
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: shrl %eax
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: orl %edi, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 92(%esp,%ebx), %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: movl 88(%esp,%ebx), %edi
-; FALLBACK16-NEXT: movl %edi, %ebx
-; FALLBACK16-NEXT: shrl %ebx
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: orl %eax, %ebx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: shrl %esi
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: orl %edi, %esi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl %edx, (%eax)
-; FALLBACK16-NEXT: movl %esi, 24(%eax)
-; FALLBACK16-NEXT: movl %ebx, 28(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 16(%eax)
-; FALLBACK16-NEXT: movl %ebp, 20(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 8(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 12(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 4(%eax)
-; FALLBACK16-NEXT: addl $108, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: shl_32bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebp
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $92, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: movl (%eax), %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%eax), %edx
-; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 8(%eax), %esi
-; FALLBACK17-NEXT: movl 12(%eax), %edi
-; FALLBACK17-NEXT: movl 16(%eax), %ebx
-; FALLBACK17-NEXT: movb (%ecx), %ch
-; FALLBACK17-NEXT: movl 20(%eax), %ebp
-; FALLBACK17-NEXT: movl 24(%eax), %edx
-; FALLBACK17-NEXT: movl 28(%eax), %eax
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movb %ch, %cl
-; FALLBACK17-NEXT: shlb $3, %cl
-; FALLBACK17-NEXT: xorps %xmm0, %xmm0
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: andb $28, %ch
-; FALLBACK17-NEXT: negb %ch
-; FALLBACK17-NEXT: movsbl %ch, %eax
-; FALLBACK17-NEXT: movl 56(%esp,%eax), %edx
-; FALLBACK17-NEXT: movl 60(%esp,%eax), %ebx
-; FALLBACK17-NEXT: movl %ebx, %esi
-; FALLBACK17-NEXT: shldl %cl, %edx, %esi
-; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 52(%esp,%eax), %esi
-; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %esi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 64(%esp,%eax), %edi
-; FALLBACK17-NEXT: movl 68(%esp,%eax), %ebp
-; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK17-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK17-NEXT: movl 48(%esp,%eax), %ebx
-; FALLBACK17-NEXT: movl 72(%esp,%eax), %edx
-; FALLBACK17-NEXT: movl 76(%esp,%eax), %esi
-; FALLBACK17-NEXT: shldl %cl, %edx, %esi
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: shldl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: movl %edx, 24(%eax)
-; FALLBACK17-NEXT: movl %esi, 28(%eax)
-; FALLBACK17-NEXT: movl %edi, 16(%eax)
-; FALLBACK17-NEXT: movl %ebp, 20(%eax)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, 8(%eax)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, 12(%eax)
-; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK17-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK17-NEXT: shll %cl, %ebx
-; FALLBACK17-NEXT: movl %ebx, (%eax)
-; FALLBACK17-NEXT: movl %edx, 4(%eax)
-; FALLBACK17-NEXT: addl $92, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: popl %ebp
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: shl_32bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $108, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl (%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%eax), %esi
-; FALLBACK18-NEXT: movl 12(%eax), %edi
-; FALLBACK18-NEXT: movl 16(%eax), %ebp
-; FALLBACK18-NEXT: movzbl (%ebx), %ebx
-; FALLBACK18-NEXT: movl 20(%eax), %edx
-; FALLBACK18-NEXT: movl 24(%eax), %ecx
-; FALLBACK18-NEXT: movl 28(%eax), %eax
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebx, %edx
-; FALLBACK18-NEXT: shlb $3, %dl
-; FALLBACK18-NEXT: xorps %xmm0, %xmm0
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: andb $28, %bl
-; FALLBACK18-NEXT: negb %bl
-; FALLBACK18-NEXT: movsbl %bl, %esi
-; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx
-; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, %eax, %edi
-; FALLBACK18-NEXT: movl %edx, %ecx
-; FALLBACK18-NEXT: notb %cl
-; FALLBACK18-NEXT: shrl %ebx
-; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx
-; FALLBACK18-NEXT: orl %edi, %ebx
-; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx
-; FALLBACK18-NEXT: movl %ebx, %edi
-; FALLBACK18-NEXT: shrl %edi
-; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax
-; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi
-; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: shrl %eax
-; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax
-; FALLBACK18-NEXT: orl %ebx, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx
-; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %ebx
-; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax
-; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %edi
-; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi
-; FALLBACK18-NEXT: orl %eax, %edi
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp
-; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %eax
-; FALLBACK18-NEXT: shrl %esi
-; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi
-; FALLBACK18-NEXT: orl %ebp, %esi
-; FALLBACK18-NEXT: shrl %ebx
-; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx
-; FALLBACK18-NEXT: orl %eax, %edx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, (%eax)
-; FALLBACK18-NEXT: movl %edx, 24(%eax)
-; FALLBACK18-NEXT: movl %esi, 28(%eax)
-; FALLBACK18-NEXT: movl %edi, 16(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 20(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 8(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 12(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 4(%eax)
-; FALLBACK18-NEXT: addl $108, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: shl_32bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $92, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl (%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 4(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%ecx), %esi
-; FALLBACK19-NEXT: movl 12(%ecx), %edi
-; FALLBACK19-NEXT: movl 16(%ecx), %ebp
-; FALLBACK19-NEXT: movzbl (%ebx), %ebx
-; FALLBACK19-NEXT: movl 20(%ecx), %edx
-; FALLBACK19-NEXT: movl 24(%ecx), %eax
-; FALLBACK19-NEXT: movl 28(%ecx), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, %ecx
-; FALLBACK19-NEXT: shlb $3, %cl
-; FALLBACK19-NEXT: xorps %xmm0, %xmm0
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: andb $28, %bl
-; FALLBACK19-NEXT: negb %bl
-; FALLBACK19-NEXT: movsbl %bl, %eax
-; FALLBACK19-NEXT: movl 56(%esp,%eax), %edx
-; FALLBACK19-NEXT: movl 60(%esp,%eax), %esi
-; FALLBACK19-NEXT: movl %esi, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %edx, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 52(%esp,%eax), %ebx
-; FALLBACK19-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 64(%esp,%eax), %edi
-; FALLBACK19-NEXT: movl 68(%esp,%eax), %ebp
-; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK19-NEXT: shldl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl 48(%esp,%eax), %edx
-; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 72(%esp,%eax), %edx
-; FALLBACK19-NEXT: movl 76(%esp,%eax), %esi
-; FALLBACK19-NEXT: shldl %cl, %edx, %esi
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: shldl %cl, %eax, %edx
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK19-NEXT: movl %edx, 24(%eax)
-; FALLBACK19-NEXT: movl %esi, 28(%eax)
-; FALLBACK19-NEXT: movl %edi, 16(%eax)
-; FALLBACK19-NEXT: movl %ebp, 20(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, 8(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, 12(%eax)
-; FALLBACK19-NEXT: movl (%esp), %esi # 4-byte Reload
-; FALLBACK19-NEXT: shlxl %ecx, %esi, %edx
-; FALLBACK19-NEXT: movl %edx, (%eax)
-; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK19-NEXT: shldl %cl, %esi, %ebx
-; FALLBACK19-NEXT: movl %ebx, 4(%eax)
-; FALLBACK19-NEXT: addl $92, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: shl_32bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $108, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK20-NEXT: movzbl (%eax), %ecx
-; FALLBACK20-NEXT: movb %cl, %dh
-; FALLBACK20-NEXT: shlb $3, %dh
-; FALLBACK20-NEXT: xorps %xmm2, %xmm2
-; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: andb $28, %cl
-; FALLBACK20-NEXT: negb %cl
-; FALLBACK20-NEXT: movsbl %cl, %ebx
-; FALLBACK20-NEXT: movl 84(%esp,%ebx), %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: movb %dh, %dl
-; FALLBACK20-NEXT: notb %dl
-; FALLBACK20-NEXT: movl 80(%esp,%ebx), %esi
-; FALLBACK20-NEXT: movl %esi, %eax
-; FALLBACK20-NEXT: shrl %eax
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: orl %edi, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: movl %ebx, %edi
-; FALLBACK20-NEXT: movl 76(%esp,%ebx), %ebp
-; FALLBACK20-NEXT: movl %ebp, %eax
-; FALLBACK20-NEXT: shrl %eax
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: orl %esi, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: movl 72(%esp,%ebx), %ebx
-; FALLBACK20-NEXT: movl %ebx, %eax
-; FALLBACK20-NEXT: shrl %eax
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: orl %ebp, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 68(%esp,%edi), %ebp
-; FALLBACK20-NEXT: movl %ebp, %esi
-; FALLBACK20-NEXT: shrl %esi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %esi
-; FALLBACK20-NEXT: orl %ebx, %esi
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: movl 64(%esp,%edi), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: shrl %ebx
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: orl %ebp, %ebx
-; FALLBACK20-NEXT: movl 88(%esp,%edi), %ebp
-; FALLBACK20-NEXT: movl %ebp, %edi
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: shrl %eax
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: orl %edi, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movl 92(%esp,%eax), %edi
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: shrl %ebp
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: orl %edi, %ebp
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl %edx, (%eax)
-; FALLBACK20-NEXT: movl %ebp, 28(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 24(%eax)
-; FALLBACK20-NEXT: movl %ebx, 4(%eax)
-; FALLBACK20-NEXT: movl %esi, 8(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 12(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 16(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 20(%eax)
-; FALLBACK20-NEXT: addl $108, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: shl_32bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $92, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movups (%ecx), %xmm0
-; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK21-NEXT: movzbl (%eax), %eax
-; FALLBACK21-NEXT: movl %eax, %ecx
-; FALLBACK21-NEXT: shlb $3, %cl
-; FALLBACK21-NEXT: xorps %xmm2, %xmm2
-; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: andb $28, %al
-; FALLBACK21-NEXT: negb %al
-; FALLBACK21-NEXT: movsbl %al, %ebp
-; FALLBACK21-NEXT: movl 64(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl 68(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 60(%esp,%ebp), %edx
-; FALLBACK21-NEXT: shldl %cl, %edx, %eax
-; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edi
-; FALLBACK21-NEXT: shldl %cl, %edi, %edx
-; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK21-NEXT: movl 52(%esp,%ebp), %ebx
-; FALLBACK21-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK21-NEXT: movl 72(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl %edx, %eax
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK21-NEXT: shldl %cl, %esi, %eax
-; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 76(%esp,%ebp), %ebp
-; FALLBACK21-NEXT: shldl %cl, %edx, %ebp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK21-NEXT: movl %ebp, 28(%edx)
-; FALLBACK21-NEXT: movl %eax, 24(%edx)
-; FALLBACK21-NEXT: movl %esi, %eax
-; FALLBACK21-NEXT: shll %cl, %eax
-; FALLBACK21-NEXT: shldl %cl, %esi, %ebx
-; FALLBACK21-NEXT: movl %ebx, 4(%edx)
-; FALLBACK21-NEXT: movl %edi, 8(%edx)
-; FALLBACK21-NEXT: movl (%esp), %ecx # 4-byte Reload
-; FALLBACK21-NEXT: movl %ecx, 12(%edx)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK21-NEXT: movl %ecx, 16(%edx)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK21-NEXT: movl %ecx, 20(%edx)
-; FALLBACK21-NEXT: movl %eax, (%edx)
-; FALLBACK21-NEXT: addl $92, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: shl_32bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $108, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK22-NEXT: movzbl (%eax), %ecx
-; FALLBACK22-NEXT: movl %ecx, %eax
-; FALLBACK22-NEXT: shlb $3, %al
-; FALLBACK22-NEXT: xorps %xmm2, %xmm2
-; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: andb $28, %cl
-; FALLBACK22-NEXT: negb %cl
-; FALLBACK22-NEXT: movsbl %cl, %edx
-; FALLBACK22-NEXT: movl 84(%esp,%edx), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx
-; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi
-; FALLBACK22-NEXT: shlxl %eax, %esi, %edi
-; FALLBACK22-NEXT: movl %eax, %ebx
-; FALLBACK22-NEXT: notb %bl
-; FALLBACK22-NEXT: shrl %esi
-; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %ecx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 76(%esp,%edx), %ecx
-; FALLBACK22-NEXT: movl %ecx, %esi
-; FALLBACK22-NEXT: shrl %esi
-; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %edi, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx
-; FALLBACK22-NEXT: movl 72(%esp,%edx), %esi
-; FALLBACK22-NEXT: movl %esi, %edi
-; FALLBACK22-NEXT: shrl %edi
-; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi
-; FALLBACK22-NEXT: orl %ecx, %edi
-; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %eax, %esi, %ecx
-; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi
-; FALLBACK22-NEXT: movl %esi, %edi
-; FALLBACK22-NEXT: shrl %edi
-; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ecx, %ebp
-; FALLBACK22-NEXT: shlxl %eax, %esi, %edi
-; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi
-; FALLBACK22-NEXT: movl %esi, %ecx
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT: orl %edi, %ecx
-; FALLBACK22-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %eax, 92(%esp,%edx), %edi
-; FALLBACK22-NEXT: movl 88(%esp,%edx), %edx
-; FALLBACK22-NEXT: shlxl %eax, %edx, %esi
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: shrl %eax
-; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT: orl %esi, %eax
-; FALLBACK22-NEXT: shrl %edx
-; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx
-; FALLBACK22-NEXT: orl %edi, %edx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK22-NEXT: movl %edi, (%esi)
-; FALLBACK22-NEXT: movl %edx, 28(%esi)
-; FALLBACK22-NEXT: movl %eax, 24(%esi)
-; FALLBACK22-NEXT: movl %ecx, 4(%esi)
-; FALLBACK22-NEXT: movl %ebp, 8(%esi)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 12(%esi)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 16(%esi)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 20(%esi)
-; FALLBACK22-NEXT: addl $108, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: shl_32bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $92, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movups (%ecx), %xmm0
-; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK23-NEXT: movzbl (%eax), %eax
-; FALLBACK23-NEXT: movl %eax, %ecx
-; FALLBACK23-NEXT: shlb $3, %cl
-; FALLBACK23-NEXT: xorps %xmm2, %xmm2
-; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: andb $28, %al
-; FALLBACK23-NEXT: negb %al
-; FALLBACK23-NEXT: movsbl %al, %ebx
-; FALLBACK23-NEXT: movl 64(%esp,%ebx), %eax
-; FALLBACK23-NEXT: movl 68(%esp,%ebx), %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %eax, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 60(%esp,%ebx), %edx
-; FALLBACK23-NEXT: shldl %cl, %edx, %eax
-; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 56(%esp,%ebx), %edi
-; FALLBACK23-NEXT: shldl %cl, %edi, %edx
-; FALLBACK23-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK23-NEXT: movl 52(%esp,%ebx), %ebp
-; FALLBACK23-NEXT: shldl %cl, %ebp, %edi
-; FALLBACK23-NEXT: movl 72(%esp,%ebx), %edx
-; FALLBACK23-NEXT: movl %edx, %eax
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK23-NEXT: shldl %cl, %esi, %eax
-; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK23-NEXT: movl 76(%esp,%ebx), %ebx
-; FALLBACK23-NEXT: shldl %cl, %edx, %ebx
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK23-NEXT: movl %ebx, 28(%edx)
-; FALLBACK23-NEXT: movl %eax, 24(%edx)
-; FALLBACK23-NEXT: shlxl %ecx, %esi, %eax
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: shldl %cl, %esi, %ebp
-; FALLBACK23-NEXT: movl %ebp, 4(%edx)
-; FALLBACK23-NEXT: movl %edi, 8(%edx)
-; FALLBACK23-NEXT: movl (%esp), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 12(%edx)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 16(%edx)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 20(%edx)
-; FALLBACK23-NEXT: movl %eax, (%edx)
-; FALLBACK23-NEXT: addl $92, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: shl_32bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $108, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT: movzbl (%eax), %ecx
-; FALLBACK24-NEXT: movb %cl, %dh
-; FALLBACK24-NEXT: shlb $3, %dh
-; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: andb $28, %cl
-; FALLBACK24-NEXT: negb %cl
-; FALLBACK24-NEXT: movsbl %cl, %ebx
-; FALLBACK24-NEXT: movl 84(%esp,%ebx), %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: movb %dh, %dl
-; FALLBACK24-NEXT: notb %dl
-; FALLBACK24-NEXT: movl 80(%esp,%ebx), %esi
-; FALLBACK24-NEXT: movl %esi, %eax
-; FALLBACK24-NEXT: shrl %eax
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: orl %edi, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: movl %ebx, %edi
-; FALLBACK24-NEXT: movl 76(%esp,%ebx), %ebp
-; FALLBACK24-NEXT: movl %ebp, %eax
-; FALLBACK24-NEXT: shrl %eax
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: orl %esi, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: movl 72(%esp,%ebx), %ebx
-; FALLBACK24-NEXT: movl %ebx, %eax
-; FALLBACK24-NEXT: shrl %eax
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: orl %ebp, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 68(%esp,%edi), %ebp
-; FALLBACK24-NEXT: movl %ebp, %esi
-; FALLBACK24-NEXT: shrl %esi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %esi
-; FALLBACK24-NEXT: orl %ebx, %esi
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: movl 64(%esp,%edi), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: shrl %ebx
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: orl %ebp, %ebx
-; FALLBACK24-NEXT: movl 88(%esp,%edi), %ebp
-; FALLBACK24-NEXT: movl %ebp, %edi
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: shrl %eax
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: orl %edi, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movl 92(%esp,%eax), %edi
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: shrl %ebp
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: orl %edi, %ebp
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl %edx, (%eax)
-; FALLBACK24-NEXT: movl %ebp, 28(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 24(%eax)
-; FALLBACK24-NEXT: movl %ebx, 4(%eax)
-; FALLBACK24-NEXT: movl %esi, 8(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 12(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 16(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 20(%eax)
-; FALLBACK24-NEXT: addl $108, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: vzeroupper
-; FALLBACK24-NEXT: retl
-;
-; FALLBACK25-LABEL: shl_32bytes:
-; FALLBACK25: # %bb.0:
-; FALLBACK25-NEXT: pushl %ebp
-; FALLBACK25-NEXT: pushl %ebx
-; FALLBACK25-NEXT: pushl %edi
-; FALLBACK25-NEXT: pushl %esi
-; FALLBACK25-NEXT: subl $92, %esp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK25-NEXT: movzbl (%eax), %eax
-; FALLBACK25-NEXT: movl %eax, %ecx
-; FALLBACK25-NEXT: shlb $3, %cl
-; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: andb $28, %al
-; FALLBACK25-NEXT: negb %al
-; FALLBACK25-NEXT: movsbl %al, %ebp
-; FALLBACK25-NEXT: movl 64(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl 68(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shldl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 60(%esp,%ebp), %edx
-; FALLBACK25-NEXT: shldl %cl, %edx, %eax
-; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edi
-; FALLBACK25-NEXT: shldl %cl, %edi, %edx
-; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK25-NEXT: movl 52(%esp,%ebp), %ebx
-; FALLBACK25-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK25-NEXT: movl 72(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl %edx, %eax
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK25-NEXT: shldl %cl, %esi, %eax
-; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 76(%esp,%ebp), %ebp
-; FALLBACK25-NEXT: shldl %cl, %edx, %ebp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK25-NEXT: movl %ebp, 28(%edx)
-; FALLBACK25-NEXT: movl %eax, 24(%edx)
-; FALLBACK25-NEXT: movl %esi, %eax
-; FALLBACK25-NEXT: shll %cl, %eax
-; FALLBACK25-NEXT: shldl %cl, %esi, %ebx
-; FALLBACK25-NEXT: movl %ebx, 4(%edx)
-; FALLBACK25-NEXT: movl %edi, 8(%edx)
-; FALLBACK25-NEXT: movl (%esp), %ecx # 4-byte Reload
-; FALLBACK25-NEXT: movl %ecx, 12(%edx)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK25-NEXT: movl %ecx, 16(%edx)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK25-NEXT: movl %ecx, 20(%edx)
-; FALLBACK25-NEXT: movl %eax, (%edx)
-; FALLBACK25-NEXT: addl $92, %esp
-; FALLBACK25-NEXT: popl %esi
-; FALLBACK25-NEXT: popl %edi
-; FALLBACK25-NEXT: popl %ebx
-; FALLBACK25-NEXT: popl %ebp
-; FALLBACK25-NEXT: vzeroupper
-; FALLBACK25-NEXT: retl
-;
-; FALLBACK26-LABEL: shl_32bytes:
-; FALLBACK26: # %bb.0:
-; FALLBACK26-NEXT: pushl %ebp
-; FALLBACK26-NEXT: pushl %ebx
-; FALLBACK26-NEXT: pushl %edi
-; FALLBACK26-NEXT: pushl %esi
-; FALLBACK26-NEXT: subl $108, %esp
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK26-NEXT: movzbl (%eax), %ecx
-; FALLBACK26-NEXT: movl %ecx, %eax
-; FALLBACK26-NEXT: shlb $3, %al
-; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: andb $28, %cl
-; FALLBACK26-NEXT: negb %cl
-; FALLBACK26-NEXT: movsbl %cl, %edx
-; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx
-; FALLBACK26-NEXT: movl 80(%esp,%edx), %esi
-; FALLBACK26-NEXT: shlxl %eax, %esi, %edi
-; FALLBACK26-NEXT: movl %eax, %ebx
-; FALLBACK26-NEXT: notb %bl
-; FALLBACK26-NEXT: shrl %esi
-; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT: orl %ecx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 76(%esp,%edx), %ecx
-; FALLBACK26-NEXT: movl %ecx, %esi
-; FALLBACK26-NEXT: shrl %esi
-; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT: orl %edi, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx
-; FALLBACK26-NEXT: movl 72(%esp,%edx), %esi
-; FALLBACK26-NEXT: movl %esi, %edi
-; FALLBACK26-NEXT: shrl %edi
-; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi
-; FALLBACK26-NEXT: orl %ecx, %edi
-; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shlxl %eax, %esi, %ecx
-; FALLBACK26-NEXT: movl 68(%esp,%edx), %esi
-; FALLBACK26-NEXT: movl %esi, %edi
-; FALLBACK26-NEXT: shrl %edi
-; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ecx, %ebp
-; FALLBACK26-NEXT: shlxl %eax, %esi, %edi
-; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi
-; FALLBACK26-NEXT: movl %esi, %ecx
-; FALLBACK26-NEXT: shrl %ecx
-; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT: orl %edi, %ecx
-; FALLBACK26-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shlxl %eax, 92(%esp,%edx), %edi
-; FALLBACK26-NEXT: movl 88(%esp,%edx), %edx
-; FALLBACK26-NEXT: shlxl %eax, %edx, %esi
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: shrl %eax
-; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT: orl %esi, %eax
-; FALLBACK26-NEXT: shrl %edx
-; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx
-; FALLBACK26-NEXT: orl %edi, %edx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK26-NEXT: movl %edi, (%esi)
-; FALLBACK26-NEXT: movl %edx, 28(%esi)
-; FALLBACK26-NEXT: movl %eax, 24(%esi)
-; FALLBACK26-NEXT: movl %ecx, 4(%esi)
-; FALLBACK26-NEXT: movl %ebp, 8(%esi)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 12(%esi)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 16(%esi)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 20(%esi)
-; FALLBACK26-NEXT: addl $108, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: vzeroupper
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: shl_32bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $92, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK27-NEXT: movzbl (%eax), %eax
-; FALLBACK27-NEXT: movl %eax, %ecx
-; FALLBACK27-NEXT: shlb $3, %cl
-; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: andb $28, %al
-; FALLBACK27-NEXT: negb %al
-; FALLBACK27-NEXT: movsbl %al, %ebx
-; FALLBACK27-NEXT: movl 64(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl 68(%esp,%ebx), %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shldl %cl, %eax, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 60(%esp,%ebx), %edx
-; FALLBACK27-NEXT: shldl %cl, %edx, %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 56(%esp,%ebx), %edi
-; FALLBACK27-NEXT: shldl %cl, %edi, %edx
-; FALLBACK27-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK27-NEXT: movl 52(%esp,%ebx), %ebp
-; FALLBACK27-NEXT: shldl %cl, %ebp, %edi
-; FALLBACK27-NEXT: movl 72(%esp,%ebx), %edx
-; FALLBACK27-NEXT: movl %edx, %eax
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK27-NEXT: shldl %cl, %esi, %eax
-; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK27-NEXT: movl 76(%esp,%ebx), %ebx
-; FALLBACK27-NEXT: shldl %cl, %edx, %ebx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK27-NEXT: movl %ebx, 28(%edx)
-; FALLBACK27-NEXT: movl %eax, 24(%edx)
-; FALLBACK27-NEXT: shlxl %ecx, %esi, %eax
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: shldl %cl, %esi, %ebp
-; FALLBACK27-NEXT: movl %ebp, 4(%edx)
-; FALLBACK27-NEXT: movl %edi, 8(%edx)
-; FALLBACK27-NEXT: movl (%esp), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 12(%edx)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 16(%edx)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 20(%edx)
-; FALLBACK27-NEXT: movl %eax, (%edx)
-; FALLBACK27-NEXT: addl $92, %esp
-; FALLBACK27-NEXT: popl %esi
-; FALLBACK27-NEXT: popl %edi
-; FALLBACK27-NEXT: popl %ebx
-; FALLBACK27-NEXT: popl %ebp
-; FALLBACK27-NEXT: vzeroupper
-; FALLBACK27-NEXT: retl
-;
-; FALLBACK28-LABEL: shl_32bytes:
-; FALLBACK28: # %bb.0:
-; FALLBACK28-NEXT: pushl %ebp
-; FALLBACK28-NEXT: pushl %ebx
-; FALLBACK28-NEXT: pushl %edi
-; FALLBACK28-NEXT: pushl %esi
-; FALLBACK28-NEXT: subl $108, %esp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK28-NEXT: movzbl (%eax), %ecx
-; FALLBACK28-NEXT: movb %cl, %dh
-; FALLBACK28-NEXT: shlb $3, %dh
-; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: andb $28, %cl
-; FALLBACK28-NEXT: negb %cl
-; FALLBACK28-NEXT: movsbl %cl, %ebx
-; FALLBACK28-NEXT: movl 84(%esp,%ebx), %edi
-; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: movb %dh, %dl
-; FALLBACK28-NEXT: notb %dl
-; FALLBACK28-NEXT: movl 80(%esp,%ebx), %esi
-; FALLBACK28-NEXT: movl %esi, %eax
-; FALLBACK28-NEXT: shrl %eax
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: orl %edi, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: movl %ebx, %edi
-; FALLBACK28-NEXT: movl 76(%esp,%ebx), %ebp
-; FALLBACK28-NEXT: movl %ebp, %eax
-; FALLBACK28-NEXT: shrl %eax
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: orl %esi, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: movl 72(%esp,%ebx), %ebx
-; FALLBACK28-NEXT: movl %ebx, %eax
-; FALLBACK28-NEXT: shrl %eax
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: orl %ebp, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 68(%esp,%edi), %ebp
-; FALLBACK28-NEXT: movl %ebp, %esi
-; FALLBACK28-NEXT: shrl %esi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %esi
-; FALLBACK28-NEXT: orl %ebx, %esi
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: movl 64(%esp,%edi), %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: shrl %ebx
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: orl %ebp, %ebx
-; FALLBACK28-NEXT: movl 88(%esp,%edi), %ebp
-; FALLBACK28-NEXT: movl %ebp, %edi
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: shrl %eax
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: orl %edi, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: movl 92(%esp,%eax), %edi
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: shrl %ebp
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: orl %edi, %ebp
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl %edx, (%eax)
-; FALLBACK28-NEXT: movl %ebp, 28(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 24(%eax)
-; FALLBACK28-NEXT: movl %ebx, 4(%eax)
-; FALLBACK28-NEXT: movl %esi, 8(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 12(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 16(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 20(%eax)
-; FALLBACK28-NEXT: addl $108, %esp
-; FALLBACK28-NEXT: popl %esi
-; FALLBACK28-NEXT: popl %edi
-; FALLBACK28-NEXT: popl %ebx
-; FALLBACK28-NEXT: popl %ebp
-; FALLBACK28-NEXT: vzeroupper
-; FALLBACK28-NEXT: retl
-;
-; FALLBACK29-LABEL: shl_32bytes:
-; FALLBACK29: # %bb.0:
-; FALLBACK29-NEXT: pushl %ebp
-; FALLBACK29-NEXT: pushl %ebx
-; FALLBACK29-NEXT: pushl %edi
-; FALLBACK29-NEXT: pushl %esi
-; FALLBACK29-NEXT: subl $92, %esp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK29-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK29-NEXT: movzbl (%eax), %eax
-; FALLBACK29-NEXT: movl %eax, %ecx
-; FALLBACK29-NEXT: shlb $3, %cl
-; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: andb $28, %al
-; FALLBACK29-NEXT: negb %al
-; FALLBACK29-NEXT: movsbl %al, %ebp
-; FALLBACK29-NEXT: movl 64(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl 68(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shldl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 60(%esp,%ebp), %edx
-; FALLBACK29-NEXT: shldl %cl, %edx, %eax
-; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edi
-; FALLBACK29-NEXT: shldl %cl, %edi, %edx
-; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK29-NEXT: movl 52(%esp,%ebp), %ebx
-; FALLBACK29-NEXT: shldl %cl, %ebx, %edi
-; FALLBACK29-NEXT: movl 72(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl %edx, %eax
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK29-NEXT: shldl %cl, %esi, %eax
-; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 76(%esp,%ebp), %ebp
-; FALLBACK29-NEXT: shldl %cl, %edx, %ebp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK29-NEXT: movl %ebp, 28(%edx)
-; FALLBACK29-NEXT: movl %eax, 24(%edx)
-; FALLBACK29-NEXT: movl %esi, %eax
-; FALLBACK29-NEXT: shll %cl, %eax
-; FALLBACK29-NEXT: shldl %cl, %esi, %ebx
-; FALLBACK29-NEXT: movl %ebx, 4(%edx)
-; FALLBACK29-NEXT: movl %edi, 8(%edx)
-; FALLBACK29-NEXT: movl (%esp), %ecx # 4-byte Reload
-; FALLBACK29-NEXT: movl %ecx, 12(%edx)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK29-NEXT: movl %ecx, 16(%edx)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK29-NEXT: movl %ecx, 20(%edx)
-; FALLBACK29-NEXT: movl %eax, (%edx)
-; FALLBACK29-NEXT: addl $92, %esp
-; FALLBACK29-NEXT: popl %esi
-; FALLBACK29-NEXT: popl %edi
-; FALLBACK29-NEXT: popl %ebx
-; FALLBACK29-NEXT: popl %ebp
-; FALLBACK29-NEXT: vzeroupper
-; FALLBACK29-NEXT: retl
-;
-; FALLBACK30-LABEL: shl_32bytes:
-; FALLBACK30: # %bb.0:
-; FALLBACK30-NEXT: pushl %ebp
-; FALLBACK30-NEXT: pushl %ebx
-; FALLBACK30-NEXT: pushl %edi
-; FALLBACK30-NEXT: pushl %esi
-; FALLBACK30-NEXT: subl $108, %esp
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK30-NEXT: movzbl (%eax), %ecx
-; FALLBACK30-NEXT: movl %ecx, %eax
-; FALLBACK30-NEXT: shlb $3, %al
-; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: andb $28, %cl
-; FALLBACK30-NEXT: negb %cl
-; FALLBACK30-NEXT: movsbl %cl, %edx
-; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx
-; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi
-; FALLBACK30-NEXT: shlxl %eax, %esi, %edi
-; FALLBACK30-NEXT: movl %eax, %ebx
-; FALLBACK30-NEXT: notb %bl
-; FALLBACK30-NEXT: shrl %esi
-; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT: orl %ecx, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 76(%esp,%edx), %ecx
-; FALLBACK30-NEXT: movl %ecx, %esi
-; FALLBACK30-NEXT: shrl %esi
-; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT: orl %edi, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx
-; FALLBACK30-NEXT: movl 72(%esp,%edx), %esi
-; FALLBACK30-NEXT: movl %esi, %edi
-; FALLBACK30-NEXT: shrl %edi
-; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi
-; FALLBACK30-NEXT: orl %ecx, %edi
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shlxl %eax, %esi, %ecx
-; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi
-; FALLBACK30-NEXT: movl %esi, %edi
-; FALLBACK30-NEXT: shrl %edi
-; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ecx, %ebp
-; FALLBACK30-NEXT: shlxl %eax, %esi, %edi
-; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi
-; FALLBACK30-NEXT: movl %esi, %ecx
-; FALLBACK30-NEXT: shrl %ecx
-; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT: orl %edi, %ecx
-; FALLBACK30-NEXT: shlxl %eax, %esi, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shlxl %eax, 92(%esp,%edx), %edi
-; FALLBACK30-NEXT: movl 88(%esp,%edx), %edx
-; FALLBACK30-NEXT: shlxl %eax, %edx, %esi
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: shrl %eax
-; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT: orl %esi, %eax
-; FALLBACK30-NEXT: shrl %edx
-; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx
-; FALLBACK30-NEXT: orl %edi, %edx
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK30-NEXT: movl %edi, (%esi)
-; FALLBACK30-NEXT: movl %edx, 28(%esi)
-; FALLBACK30-NEXT: movl %eax, 24(%esi)
-; FALLBACK30-NEXT: movl %ecx, 4(%esi)
-; FALLBACK30-NEXT: movl %ebp, 8(%esi)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 12(%esi)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 16(%esi)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 20(%esi)
-; FALLBACK30-NEXT: addl $108, %esp
-; FALLBACK30-NEXT: popl %esi
-; FALLBACK30-NEXT: popl %edi
-; FALLBACK30-NEXT: popl %ebx
-; FALLBACK30-NEXT: popl %ebp
-; FALLBACK30-NEXT: vzeroupper
-; FALLBACK30-NEXT: retl
-;
-; FALLBACK31-LABEL: shl_32bytes:
-; FALLBACK31: # %bb.0:
-; FALLBACK31-NEXT: pushl %ebp
-; FALLBACK31-NEXT: pushl %ebx
-; FALLBACK31-NEXT: pushl %edi
-; FALLBACK31-NEXT: pushl %esi
-; FALLBACK31-NEXT: subl $92, %esp
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK31-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK31-NEXT: movzbl (%eax), %eax
-; FALLBACK31-NEXT: movl %eax, %ecx
-; FALLBACK31-NEXT: shlb $3, %cl
-; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: andb $28, %al
-; FALLBACK31-NEXT: negb %al
-; FALLBACK31-NEXT: movsbl %al, %ebx
-; FALLBACK31-NEXT: movl 64(%esp,%ebx), %eax
-; FALLBACK31-NEXT: movl 68(%esp,%ebx), %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shldl %cl, %eax, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 60(%esp,%ebx), %edx
-; FALLBACK31-NEXT: shldl %cl, %edx, %eax
-; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 56(%esp,%ebx), %edi
-; FALLBACK31-NEXT: shldl %cl, %edi, %edx
-; FALLBACK31-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK31-NEXT: movl 52(%esp,%ebx), %ebp
-; FALLBACK31-NEXT: shldl %cl, %ebp, %edi
-; FALLBACK31-NEXT: movl 72(%esp,%ebx), %edx
-; FALLBACK31-NEXT: movl %edx, %eax
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK31-NEXT: shldl %cl, %esi, %eax
-; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK31-NEXT: movl 76(%esp,%ebx), %ebx
-; FALLBACK31-NEXT: shldl %cl, %edx, %ebx
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK31-NEXT: movl %ebx, 28(%edx)
-; FALLBACK31-NEXT: movl %eax, 24(%edx)
-; FALLBACK31-NEXT: shlxl %ecx, %esi, %eax
-; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK31-NEXT: shldl %cl, %esi, %ebp
-; FALLBACK31-NEXT: movl %ebp, 4(%edx)
-; FALLBACK31-NEXT: movl %edi, 8(%edx)
-; FALLBACK31-NEXT: movl (%esp), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 12(%edx)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 16(%edx)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 20(%edx)
-; FALLBACK31-NEXT: movl %eax, (%edx)
-; FALLBACK31-NEXT: addl $92, %esp
-; FALLBACK31-NEXT: popl %esi
-; FALLBACK31-NEXT: popl %edi
-; FALLBACK31-NEXT: popl %ebx
-; FALLBACK31-NEXT: popl %ebp
-; FALLBACK31-NEXT: vzeroupper
-; FALLBACK31-NEXT: retl
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r10), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r10), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r10), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r10), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rcx, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %al
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %sil, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rsi, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rsi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %al
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movsbq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %al
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %sil, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rsi, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rsi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r9, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %al
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb (%eax), %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ah, %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %ah
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ah, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%ebx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%ebx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb (%ecx), %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %ch
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbl %ch, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%eax), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 28(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 16(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 20(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 4(%eax)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %bl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ebx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, 92(%esp,%ecx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %edx, %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %bl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %bl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbl %bl, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%eax), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %cl, %dh
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %dh
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %dl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%eax), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %al
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbl %al, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 28(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%edx)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %dl, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%edx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%edx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%edx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, 92(%esp,%edx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 4(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbl %al, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebp, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 28(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 4(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 8(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, (%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
+; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %cl, %dh
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %dh
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movsbl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%ebx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %dl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%ebx), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%ebx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%eax), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %al
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbl %al, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %ebx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %edx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 28(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 24(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 4(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 8(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 16(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%edx)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %dl, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%edx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 64(%esp,%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, 92(%esp,%edx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, (%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 28(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 4(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 16(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 20(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %al
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %al
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbl %al, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 64(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %ebp, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 28(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 24(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 4(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 8(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 12(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 16(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 20(%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, (%edx)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
%bitOff = shl i256 %byteOff, 3
@@ -7880,617 +6434,472 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
}
define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: shl_32bytes_dwordOff:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rcx
-; FALLBACK0-NEXT: movq 8(%rdi), %r8
-; FALLBACK0-NEXT: movq 16(%rdi), %r9
-; FALLBACK0-NEXT: movq 24(%rdi), %rdi
-; FALLBACK0-NEXT: movzbl (%rsi), %esi
-; FALLBACK0-NEXT: movl %esi, %eax
-; FALLBACK0-NEXT: shlb $5, %al
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: shlb $2, %sil
-; FALLBACK0-NEXT: andb $24, %sil
-; FALLBACK0-NEXT: negb %sil
-; FALLBACK0-NEXT: movsbq %sil, %r10
-; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8
-; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq %r8, %r9
-; FALLBACK0-NEXT: shrq %r9
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r9
-; FALLBACK0-NEXT: orq %r11, %r9
-; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r11
-; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10
-; FALLBACK0-NEXT: movq %r10, %rbx
-; FALLBACK0-NEXT: shrq %rbx
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: orq %r11, %rbx
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: shrq %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: movq %r8, (%rdx)
-; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK0-NEXT: movq %rbx, 24(%rdx)
-; FALLBACK0-NEXT: movq %r9, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: shl_32bytes_dwordOff:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: movl %esi, %ecx
-; FALLBACK1-NEXT: shlb $5, %cl
-; FALLBACK1-NEXT: xorps %xmm0, %xmm0
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: shlb $2, %sil
-; FALLBACK1-NEXT: andb $24, %sil
-; FALLBACK1-NEXT: negb %sil
-; FALLBACK1-NEXT: movsbq %sil, %rax
-; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK1-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK1-NEXT: shldq %cl, %r8, %rax
-; FALLBACK1-NEXT: shlq %cl, %r8
-; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK1-NEXT: movq %r8, (%rdx)
-; FALLBACK1-NEXT: movq %rax, 8(%rdx)
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: shl_32bytes_dwordOff:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %rdi
-; FALLBACK2-NEXT: movzbl (%rsi), %esi
-; FALLBACK2-NEXT: movl %esi, %eax
-; FALLBACK2-NEXT: shlb $5, %al
-; FALLBACK2-NEXT: xorps %xmm0, %xmm0
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: shlb $2, %sil
-; FALLBACK2-NEXT: andb $24, %sil
-; FALLBACK2-NEXT: negb %sil
-; FALLBACK2-NEXT: movsbq %sil, %rsi
-; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi
-; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx
-; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8
-; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9
-; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi
-; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10
-; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11
-; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK2-NEXT: notb %al
-; FALLBACK2-NEXT: shrq %rdi
-; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r8, %rdi
-; FALLBACK2-NEXT: shrq %rsi
-; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r9, %rsi
-; FALLBACK2-NEXT: shrq %rcx
-; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax
-; FALLBACK2-NEXT: orq %r10, %rax
-; FALLBACK2-NEXT: movq %r11, (%rdx)
-; FALLBACK2-NEXT: movq %rax, 16(%rdx)
-; FALLBACK2-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: shl_32bytes_dwordOff:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: movq (%rdi), %rax
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %rdi
-; FALLBACK3-NEXT: movzbl (%rsi), %esi
-; FALLBACK3-NEXT: movl %esi, %ecx
-; FALLBACK3-NEXT: shlb $5, %cl
-; FALLBACK3-NEXT: xorps %xmm0, %xmm0
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: shlb $2, %sil
-; FALLBACK3-NEXT: andb $24, %sil
-; FALLBACK3-NEXT: negb %sil
-; FALLBACK3-NEXT: movsbq %sil, %rax
-; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK3-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK3-NEXT: shldq %cl, %r8, %rax
-; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx
-; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK3-NEXT: movq %rcx, (%rdx)
-; FALLBACK3-NEXT: movq %rax, 8(%rdx)
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: shl_32bytes_dwordOff:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK4-NEXT: movzbl (%rsi), %ecx
-; FALLBACK4-NEXT: movl %ecx, %eax
-; FALLBACK4-NEXT: shlb $5, %al
-; FALLBACK4-NEXT: xorps %xmm2, %xmm2
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: shlb $2, %cl
-; FALLBACK4-NEXT: andb $24, %cl
-; FALLBACK4-NEXT: negb %cl
-; FALLBACK4-NEXT: movsbq %cl, %r8
-; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r9
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10
-; FALLBACK4-NEXT: movq %r10, %rdi
-; FALLBACK4-NEXT: shrq %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %rdi
-; FALLBACK4-NEXT: orq %r9, %rdi
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9
-; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8
-; FALLBACK4-NEXT: movq %r8, %r11
-; FALLBACK4-NEXT: shrq %r11
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r11
-; FALLBACK4-NEXT: orq %r10, %r11
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r8
-; FALLBACK4-NEXT: movq %r9, %r10
-; FALLBACK4-NEXT: shrq %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: orq %r8, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r9
-; FALLBACK4-NEXT: movq %r9, (%rdx)
-; FALLBACK4-NEXT: movq %r10, 8(%rdx)
-; FALLBACK4-NEXT: movq %r11, 16(%rdx)
-; FALLBACK4-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: shl_32bytes_dwordOff:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK5-NEXT: movzbl (%rsi), %eax
-; FALLBACK5-NEXT: movl %eax, %ecx
-; FALLBACK5-NEXT: shlb $5, %cl
-; FALLBACK5-NEXT: xorps %xmm2, %xmm2
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: shlb $2, %al
-; FALLBACK5-NEXT: andb $24, %al
-; FALLBACK5-NEXT: negb %al
-; FALLBACK5-NEXT: movsbq %al, %rax
-; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK5-NEXT: movq %r8, %r9
-; FALLBACK5-NEXT: shlq %cl, %r9
-; FALLBACK5-NEXT: shldq %cl, %r8, %rax
-; FALLBACK5-NEXT: movq %rax, 8(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r9, (%rdx)
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: shl_32bytes_dwordOff:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK6-NEXT: movzbl (%rsi), %ecx
-; FALLBACK6-NEXT: movl %ecx, %eax
-; FALLBACK6-NEXT: shlb $5, %al
-; FALLBACK6-NEXT: xorps %xmm2, %xmm2
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: shlb $2, %cl
-; FALLBACK6-NEXT: andb $24, %cl
-; FALLBACK6-NEXT: negb %cl
-; FALLBACK6-NEXT: movsbq %cl, %rcx
-; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi
-; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8
-; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9
-; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx
-; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10
-; FALLBACK6-NEXT: shlxq %rax, %r9, %r11
-; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK6-NEXT: notb %al
-; FALLBACK6-NEXT: shrq %rdi
-; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT: orq %rsi, %rdi
-; FALLBACK6-NEXT: shrq %rcx
-; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT: orq %r8, %rcx
-; FALLBACK6-NEXT: shrq %r9
-; FALLBACK6-NEXT: shrxq %rax, %r9, %rax
-; FALLBACK6-NEXT: orq %r10, %rax
-; FALLBACK6-NEXT: movq %r11, (%rdx)
-; FALLBACK6-NEXT: movq %rax, 8(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: shl_32bytes_dwordOff:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK7-NEXT: movzbl (%rsi), %eax
-; FALLBACK7-NEXT: movl %eax, %ecx
-; FALLBACK7-NEXT: shlb $5, %cl
-; FALLBACK7-NEXT: xorps %xmm2, %xmm2
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: shlb $2, %al
-; FALLBACK7-NEXT: andb $24, %al
-; FALLBACK7-NEXT: negb %al
-; FALLBACK7-NEXT: movsbq %al, %rax
-; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9
-; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK7-NEXT: shldq %cl, %r8, %rax
-; FALLBACK7-NEXT: movq %rax, 8(%rdx)
-; FALLBACK7-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK7-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK7-NEXT: movq %r9, (%rdx)
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: shl_32bytes_dwordOff:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK8-NEXT: movzbl (%rsi), %ecx
-; FALLBACK8-NEXT: movl %ecx, %eax
-; FALLBACK8-NEXT: shlb $5, %al
-; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: shlb $2, %cl
-; FALLBACK8-NEXT: andb $24, %cl
-; FALLBACK8-NEXT: negb %cl
-; FALLBACK8-NEXT: movsbq %cl, %r8
-; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r9
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10
-; FALLBACK8-NEXT: movq %r10, %rdi
-; FALLBACK8-NEXT: shrq %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %rdi
-; FALLBACK8-NEXT: orq %r9, %rdi
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9
-; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8
-; FALLBACK8-NEXT: movq %r8, %r11
-; FALLBACK8-NEXT: shrq %r11
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r11
-; FALLBACK8-NEXT: orq %r10, %r11
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r8
-; FALLBACK8-NEXT: movq %r9, %r10
-; FALLBACK8-NEXT: shrq %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: orq %r8, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r9
-; FALLBACK8-NEXT: movq %r9, (%rdx)
-; FALLBACK8-NEXT: movq %r10, 8(%rdx)
-; FALLBACK8-NEXT: movq %r11, 16(%rdx)
-; FALLBACK8-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK8-NEXT: vzeroupper
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: shl_32bytes_dwordOff:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK9-NEXT: movzbl (%rsi), %eax
-; FALLBACK9-NEXT: movl %eax, %ecx
-; FALLBACK9-NEXT: shlb $5, %cl
-; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: shlb $2, %al
-; FALLBACK9-NEXT: andb $24, %al
-; FALLBACK9-NEXT: negb %al
-; FALLBACK9-NEXT: movsbq %al, %rax
-; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK9-NEXT: movq %r8, %r9
-; FALLBACK9-NEXT: shlq %cl, %r9
-; FALLBACK9-NEXT: shldq %cl, %r8, %rax
-; FALLBACK9-NEXT: movq %rax, 8(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r9, (%rdx)
-; FALLBACK9-NEXT: vzeroupper
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: shl_32bytes_dwordOff:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT: movzbl (%rsi), %ecx
-; FALLBACK10-NEXT: movl %ecx, %eax
-; FALLBACK10-NEXT: shlb $5, %al
-; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: shlb $2, %cl
-; FALLBACK10-NEXT: andb $24, %cl
-; FALLBACK10-NEXT: negb %cl
-; FALLBACK10-NEXT: movsbq %cl, %rcx
-; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi
-; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8
-; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9
-; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx
-; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10
-; FALLBACK10-NEXT: shlxq %rax, %r9, %r11
-; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT: notb %al
-; FALLBACK10-NEXT: shrq %rdi
-; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT: orq %rsi, %rdi
-; FALLBACK10-NEXT: shrq %rcx
-; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT: orq %r8, %rcx
-; FALLBACK10-NEXT: shrq %r9
-; FALLBACK10-NEXT: shrxq %rax, %r9, %rax
-; FALLBACK10-NEXT: orq %r10, %rax
-; FALLBACK10-NEXT: movq %r11, (%rdx)
-; FALLBACK10-NEXT: movq %rax, 8(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK10-NEXT: vzeroupper
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: shl_32bytes_dwordOff:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK11-NEXT: movzbl (%rsi), %eax
-; FALLBACK11-NEXT: movl %eax, %ecx
-; FALLBACK11-NEXT: shlb $5, %cl
-; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: shlb $2, %al
-; FALLBACK11-NEXT: andb $24, %al
-; FALLBACK11-NEXT: negb %al
-; FALLBACK11-NEXT: movsbq %al, %rax
-; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9
-; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK11-NEXT: shldq %cl, %r8, %rax
-; FALLBACK11-NEXT: movq %rax, 8(%rdx)
-; FALLBACK11-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK11-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK11-NEXT: movq %r9, (%rdx)
-; FALLBACK11-NEXT: vzeroupper
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: shl_32bytes_dwordOff:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK12-NEXT: movzbl (%rsi), %ecx
-; FALLBACK12-NEXT: movl %ecx, %eax
-; FALLBACK12-NEXT: shlb $5, %al
-; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: shlb $2, %cl
-; FALLBACK12-NEXT: andb $24, %cl
-; FALLBACK12-NEXT: negb %cl
-; FALLBACK12-NEXT: movsbq %cl, %r8
-; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r9
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10
-; FALLBACK12-NEXT: movq %r10, %rdi
-; FALLBACK12-NEXT: shrq %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %rdi
-; FALLBACK12-NEXT: orq %r9, %rdi
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9
-; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8
-; FALLBACK12-NEXT: movq %r8, %r11
-; FALLBACK12-NEXT: shrq %r11
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r11
-; FALLBACK12-NEXT: orq %r10, %r11
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r8
-; FALLBACK12-NEXT: movq %r9, %r10
-; FALLBACK12-NEXT: shrq %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: orq %r8, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r9
-; FALLBACK12-NEXT: movq %r9, (%rdx)
-; FALLBACK12-NEXT: movq %r10, 8(%rdx)
-; FALLBACK12-NEXT: movq %r11, 16(%rdx)
-; FALLBACK12-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK12-NEXT: vzeroupper
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: shl_32bytes_dwordOff:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK13-NEXT: movzbl (%rsi), %eax
-; FALLBACK13-NEXT: movl %eax, %ecx
-; FALLBACK13-NEXT: shlb $5, %cl
-; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: shlb $2, %al
-; FALLBACK13-NEXT: andb $24, %al
-; FALLBACK13-NEXT: negb %al
-; FALLBACK13-NEXT: movsbq %al, %rax
-; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK13-NEXT: movq %r8, %r9
-; FALLBACK13-NEXT: shlq %cl, %r9
-; FALLBACK13-NEXT: shldq %cl, %r8, %rax
-; FALLBACK13-NEXT: movq %rax, 8(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r9, (%rdx)
-; FALLBACK13-NEXT: vzeroupper
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: shl_32bytes_dwordOff:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT: movzbl (%rsi), %ecx
-; FALLBACK14-NEXT: movl %ecx, %eax
-; FALLBACK14-NEXT: shlb $5, %al
-; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: shlb $2, %cl
-; FALLBACK14-NEXT: andb $24, %cl
-; FALLBACK14-NEXT: negb %cl
-; FALLBACK14-NEXT: movsbq %cl, %rcx
-; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi
-; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi
-; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8
-; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9
-; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx
-; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10
-; FALLBACK14-NEXT: shlxq %rax, %r9, %r11
-; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT: notb %al
-; FALLBACK14-NEXT: shrq %rdi
-; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT: orq %rsi, %rdi
-; FALLBACK14-NEXT: shrq %rcx
-; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT: orq %r8, %rcx
-; FALLBACK14-NEXT: shrq %r9
-; FALLBACK14-NEXT: shrxq %rax, %r9, %rax
-; FALLBACK14-NEXT: orq %r10, %rax
-; FALLBACK14-NEXT: movq %r11, (%rdx)
-; FALLBACK14-NEXT: movq %rax, 8(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK14-NEXT: vzeroupper
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: shl_32bytes_dwordOff:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK15-NEXT: movzbl (%rsi), %eax
-; FALLBACK15-NEXT: movl %eax, %ecx
-; FALLBACK15-NEXT: shlb $5, %cl
-; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: shlb $2, %al
-; FALLBACK15-NEXT: andb $24, %al
-; FALLBACK15-NEXT: negb %al
-; FALLBACK15-NEXT: movsbq %al, %rax
-; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi
-; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi
-; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi
-; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8
-; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax
-; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9
-; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK15-NEXT: shldq %cl, %r8, %rax
-; FALLBACK15-NEXT: movq %rax, 8(%rdx)
-; FALLBACK15-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK15-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK15-NEXT: movq %r9, (%rdx)
-; FALLBACK15-NEXT: vzeroupper
-; FALLBACK15-NEXT: retq
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $2, %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r10), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r10), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r10), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r10), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $2, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negb %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movsbq %sil, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $2, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rdi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rsi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rcx, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $2, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negb %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movsbq %sil, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $2, %cl
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %cl
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negb %cl
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movsbq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $2, %al
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %al
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negb %al
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $2, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negb %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %sil, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rsi, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rsi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm2, %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $2, %al
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %al
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negb %al
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $2, %cl
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %cl
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: negb %cl
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movsbq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $2, %al
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %al
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: negb %al
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $2, %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: negb %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %sil, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, -16(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rsi, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rsi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r9, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rax, %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: shl_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $2, %al
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %al
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: negb %al
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movsbq %al, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -24(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -16(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rsi, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -40(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -32(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shldq %cl, %r8, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X86-SSE2-LABEL: shl_32bytes_dwordOff:
; X86-SSE2: # %bb.0:
@@ -8800,2193 +7209,1656 @@ define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
}
define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: ashr_32bytes:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rcx
-; FALLBACK0-NEXT: movq 8(%rdi), %r8
-; FALLBACK0-NEXT: movq 16(%rdi), %r9
-; FALLBACK0-NEXT: movq 24(%rdi), %rdi
-; FALLBACK0-NEXT: movzbl (%rsi), %esi
-; FALLBACK0-NEXT: leal (,%rsi,8), %eax
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: sarq $63, %rdi
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: andb $24, %sil
-; FALLBACK0-NEXT: movzbl %sil, %r9d
-; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx
-; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: orq %r11, %r8
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r10
-; FALLBACK0-NEXT: addq %rdi, %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: orq %rbx, %r10
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: sarq %cl, %r9
-; FALLBACK0-NEXT: movq %r9, 24(%rdx)
-; FALLBACK0-NEXT: movq %r10, 16(%rdx)
-; FALLBACK0-NEXT: movq %rdi, (%rdx)
-; FALLBACK0-NEXT: movq %r8, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: ashr_32bytes:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: sarq $63, %rdi
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: andb $24, %sil
-; FALLBACK1-NEXT: movzbl %sil, %eax
-; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi
-; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi
-; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8
-; FALLBACK1-NEXT: movq %r8, %r9
-; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax
-; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK1-NEXT: sarq %cl, %rax
-; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rax, 24(%rdx)
-; FALLBACK1-NEXT: movq %rdi, (%rdx)
-; FALLBACK1-NEXT: movq %r9, 8(%rdx)
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: ashr_32bytes:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %rdi
-; FALLBACK2-NEXT: movzbl (%rsi), %esi
-; FALLBACK2-NEXT: leal (,%rsi,8), %eax
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: sarq $63, %rdi
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: andb $24, %sil
-; FALLBACK2-NEXT: movzbl %sil, %ecx
-; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi
-; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi
-; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
-; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9
-; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK2-NEXT: notb %al
-; FALLBACK2-NEXT: addq %rdi, %rdi
-; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r8, %rdi
-; FALLBACK2-NEXT: addq %rsi, %rsi
-; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r9, %rsi
-; FALLBACK2-NEXT: addq %rcx, %rcx
-; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
-; FALLBACK2-NEXT: orq %r10, %rax
-; FALLBACK2-NEXT: movq %r11, 24(%rdx)
-; FALLBACK2-NEXT: movq %rax, 16(%rdx)
-; FALLBACK2-NEXT: movq %rsi, (%rdx)
-; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: ashr_32bytes:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: movq (%rdi), %rax
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %rdi
-; FALLBACK3-NEXT: movzbl (%rsi), %esi
-; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: sarq $63, %rdi
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: andb $24, %sil
-; FALLBACK3-NEXT: movzbl %sil, %eax
-; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi
-; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi
-; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8
-; FALLBACK3-NEXT: movq %r8, %r9
-; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax
-; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
-; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rax, 24(%rdx)
-; FALLBACK3-NEXT: movq %rdi, (%rdx)
-; FALLBACK3-NEXT: movq %r9, 8(%rdx)
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: ashr_32bytes:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: pushq %rbx
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movq 16(%rdi), %rcx
-; FALLBACK4-NEXT: movq 24(%rdi), %rdi
-; FALLBACK4-NEXT: movzbl (%rsi), %esi
-; FALLBACK4-NEXT: leal (,%rsi,8), %eax
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: sarq $63, %rdi
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: andb $24, %sil
-; FALLBACK4-NEXT: movzbl %sil, %r9d
-; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rdi
-; FALLBACK4-NEXT: orq %r10, %rdi
-; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10
-; FALLBACK4-NEXT: movq %r10, %r11
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r11
-; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rbx
-; FALLBACK4-NEXT: orq %r11, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r8
-; FALLBACK4-NEXT: addq %r10, %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: orq %r8, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: sarq %cl, %r9
-; FALLBACK4-NEXT: movq %r9, 24(%rdx)
-; FALLBACK4-NEXT: movq %r10, 8(%rdx)
-; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK4-NEXT: movq %rdi, (%rdx)
-; FALLBACK4-NEXT: popq %rbx
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: ashr_32bytes:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movq 16(%rdi), %rax
-; FALLBACK5-NEXT: movq 24(%rdi), %rdi
-; FALLBACK5-NEXT: movzbl (%rsi), %esi
-; FALLBACK5-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: sarq $63, %rdi
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: andb $24, %sil
-; FALLBACK5-NEXT: movzbl %sil, %eax
-; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK5-NEXT: movq %rdi, %r8
-; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK5-NEXT: movq %rax, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK5-NEXT: sarq %cl, %rsi
-; FALLBACK5-NEXT: movq %r10, 8(%rdx)
-; FALLBACK5-NEXT: movq %r8, 16(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r9, (%rdx)
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: ashr_32bytes:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movq 16(%rdi), %rcx
-; FALLBACK6-NEXT: movq 24(%rdi), %rdi
-; FALLBACK6-NEXT: movzbl (%rsi), %esi
-; FALLBACK6-NEXT: leal (,%rsi,8), %eax
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: sarq $63, %rdi
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: andb $24, %sil
-; FALLBACK6-NEXT: movzbl %sil, %ecx
-; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi
-; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8
-; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK6-NEXT: notb %al
-; FALLBACK6-NEXT: addq %rdi, %rdi
-; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT: orq %rsi, %rdi
-; FALLBACK6-NEXT: addq %rcx, %rcx
-; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT: orq %r9, %rcx
-; FALLBACK6-NEXT: addq %r8, %r8
-; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK6-NEXT: orq %r10, %rax
-; FALLBACK6-NEXT: movq %r11, 24(%rdx)
-; FALLBACK6-NEXT: movq %rax, 8(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT: movq %rdi, (%rdx)
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: ashr_32bytes:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movq 16(%rdi), %rax
-; FALLBACK7-NEXT: movq 24(%rdi), %rdi
-; FALLBACK7-NEXT: movzbl (%rsi), %esi
-; FALLBACK7-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: sarq $63, %rdi
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: andb $24, %sil
-; FALLBACK7-NEXT: movzbl %sil, %eax
-; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK7-NEXT: movq %rdi, %r8
-; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK7-NEXT: movq %rax, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK7-NEXT: movq %r10, 8(%rdx)
-; FALLBACK7-NEXT: movq %r8, 16(%rdx)
-; FALLBACK7-NEXT: movq %rax, 24(%rdx)
-; FALLBACK7-NEXT: movq %r9, (%rdx)
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: ashr_32bytes:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: pushq %rbx
-; FALLBACK8-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK8-NEXT: movq 16(%rdi), %rcx
-; FALLBACK8-NEXT: movq 24(%rdi), %rdi
-; FALLBACK8-NEXT: movzbl (%rsi), %esi
-; FALLBACK8-NEXT: leal (,%rsi,8), %eax
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: sarq $63, %rdi
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: andb $24, %sil
-; FALLBACK8-NEXT: movzbl %sil, %r9d
-; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rdi
-; FALLBACK8-NEXT: orq %r10, %rdi
-; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10
-; FALLBACK8-NEXT: movq %r10, %r11
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r11
-; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rbx
-; FALLBACK8-NEXT: orq %r11, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r8
-; FALLBACK8-NEXT: addq %r10, %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: orq %r8, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: sarq %cl, %r9
-; FALLBACK8-NEXT: movq %r9, 24(%rdx)
-; FALLBACK8-NEXT: movq %r10, 8(%rdx)
-; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK8-NEXT: movq %rdi, (%rdx)
-; FALLBACK8-NEXT: popq %rbx
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: ashr_32bytes:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK9-NEXT: movq 16(%rdi), %rax
-; FALLBACK9-NEXT: movq 24(%rdi), %rdi
-; FALLBACK9-NEXT: movzbl (%rsi), %esi
-; FALLBACK9-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: sarq $63, %rdi
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: andb $24, %sil
-; FALLBACK9-NEXT: movzbl %sil, %eax
-; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK9-NEXT: movq %rdi, %r8
-; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK9-NEXT: movq %rax, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK9-NEXT: sarq %cl, %rsi
-; FALLBACK9-NEXT: movq %r10, 8(%rdx)
-; FALLBACK9-NEXT: movq %r8, 16(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r9, (%rdx)
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: ashr_32bytes:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK10-NEXT: movq 16(%rdi), %rcx
-; FALLBACK10-NEXT: movq 24(%rdi), %rdi
-; FALLBACK10-NEXT: movzbl (%rsi), %esi
-; FALLBACK10-NEXT: leal (,%rsi,8), %eax
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: sarq $63, %rdi
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: andb $24, %sil
-; FALLBACK10-NEXT: movzbl %sil, %ecx
-; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi
-; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8
-; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT: notb %al
-; FALLBACK10-NEXT: addq %rdi, %rdi
-; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT: orq %rsi, %rdi
-; FALLBACK10-NEXT: addq %rcx, %rcx
-; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT: orq %r9, %rcx
-; FALLBACK10-NEXT: addq %r8, %r8
-; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK10-NEXT: orq %r10, %rax
-; FALLBACK10-NEXT: movq %r11, 24(%rdx)
-; FALLBACK10-NEXT: movq %rax, 8(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT: movq %rdi, (%rdx)
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: ashr_32bytes:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK11-NEXT: movq 16(%rdi), %rax
-; FALLBACK11-NEXT: movq 24(%rdi), %rdi
-; FALLBACK11-NEXT: movzbl (%rsi), %esi
-; FALLBACK11-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: sarq $63, %rdi
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: andb $24, %sil
-; FALLBACK11-NEXT: movzbl %sil, %eax
-; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK11-NEXT: movq %rdi, %r8
-; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK11-NEXT: movq %rax, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK11-NEXT: movq %r10, 8(%rdx)
-; FALLBACK11-NEXT: movq %r8, 16(%rdx)
-; FALLBACK11-NEXT: movq %rax, 24(%rdx)
-; FALLBACK11-NEXT: movq %r9, (%rdx)
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: ashr_32bytes:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: pushq %rbx
-; FALLBACK12-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK12-NEXT: movq 16(%rdi), %rcx
-; FALLBACK12-NEXT: movq 24(%rdi), %rdi
-; FALLBACK12-NEXT: movzbl (%rsi), %esi
-; FALLBACK12-NEXT: leal (,%rsi,8), %eax
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: sarq $63, %rdi
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: andb $24, %sil
-; FALLBACK12-NEXT: movzbl %sil, %r9d
-; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10
-; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rdi
-; FALLBACK12-NEXT: orq %r10, %rdi
-; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10
-; FALLBACK12-NEXT: movq %r10, %r11
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r11
-; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9
-; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rbx
-; FALLBACK12-NEXT: orq %r11, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r8
-; FALLBACK12-NEXT: addq %r10, %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: orq %r8, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: sarq %cl, %r9
-; FALLBACK12-NEXT: movq %r9, 24(%rdx)
-; FALLBACK12-NEXT: movq %r10, 8(%rdx)
-; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK12-NEXT: movq %rdi, (%rdx)
-; FALLBACK12-NEXT: popq %rbx
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: ashr_32bytes:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK13-NEXT: movq 16(%rdi), %rax
-; FALLBACK13-NEXT: movq 24(%rdi), %rdi
-; FALLBACK13-NEXT: movzbl (%rsi), %esi
-; FALLBACK13-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: sarq $63, %rdi
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: andb $24, %sil
-; FALLBACK13-NEXT: movzbl %sil, %eax
-; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK13-NEXT: movq %rdi, %r8
-; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK13-NEXT: movq %rax, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK13-NEXT: sarq %cl, %rsi
-; FALLBACK13-NEXT: movq %r10, 8(%rdx)
-; FALLBACK13-NEXT: movq %r8, 16(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r9, (%rdx)
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: ashr_32bytes:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK14-NEXT: movq 16(%rdi), %rcx
-; FALLBACK14-NEXT: movq 24(%rdi), %rdi
-; FALLBACK14-NEXT: movzbl (%rsi), %esi
-; FALLBACK14-NEXT: leal (,%rsi,8), %eax
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: sarq $63, %rdi
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: andb $24, %sil
-; FALLBACK14-NEXT: movzbl %sil, %ecx
-; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi
-; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi
-; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8
-; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx
-; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT: notb %al
-; FALLBACK14-NEXT: addq %rdi, %rdi
-; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT: orq %rsi, %rdi
-; FALLBACK14-NEXT: addq %rcx, %rcx
-; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT: orq %r9, %rcx
-; FALLBACK14-NEXT: addq %r8, %r8
-; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK14-NEXT: orq %r10, %rax
-; FALLBACK14-NEXT: movq %r11, 24(%rdx)
-; FALLBACK14-NEXT: movq %rax, 8(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT: movq %rdi, (%rdx)
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: ashr_32bytes:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK15-NEXT: movq 16(%rdi), %rax
-; FALLBACK15-NEXT: movq 24(%rdi), %rdi
-; FALLBACK15-NEXT: movzbl (%rsi), %esi
-; FALLBACK15-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: sarq $63, %rdi
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: andb $24, %sil
-; FALLBACK15-NEXT: movzbl %sil, %eax
-; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi
-; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi
-; FALLBACK15-NEXT: movq %rdi, %r8
-; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9
-; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax
-; FALLBACK15-NEXT: movq %rax, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK15-NEXT: movq %r10, 8(%rdx)
-; FALLBACK15-NEXT: movq %r8, 16(%rdx)
-; FALLBACK15-NEXT: movq %rax, 24(%rdx)
-; FALLBACK15-NEXT: movq %r9, (%rdx)
-; FALLBACK15-NEXT: retq
-;
-; FALLBACK16-LABEL: ashr_32bytes:
-; FALLBACK16: # %bb.0:
-; FALLBACK16-NEXT: pushl %ebp
-; FALLBACK16-NEXT: pushl %ebx
-; FALLBACK16-NEXT: pushl %edi
-; FALLBACK16-NEXT: pushl %esi
-; FALLBACK16-NEXT: subl $108, %esp
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK16-NEXT: movl (%esi), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 4(%esi), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 8(%esi), %ebx
-; FALLBACK16-NEXT: movl 12(%esi), %ebp
-; FALLBACK16-NEXT: movl 16(%esi), %edi
-; FALLBACK16-NEXT: movzbl (%eax), %ecx
-; FALLBACK16-NEXT: movl 20(%esi), %edx
-; FALLBACK16-NEXT: movl 24(%esi), %eax
-; FALLBACK16-NEXT: movl 28(%esi), %esi
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, %edx
-; FALLBACK16-NEXT: shlb $3, %dl
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: sarl $31, %esi
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: andb $28, %cl
-; FALLBACK16-NEXT: movzbl %cl, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi
-; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax
-; FALLBACK16-NEXT: movl %eax, %ebx
-; FALLBACK16-NEXT: movl %edx, %ecx
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: movb %dl, %ch
-; FALLBACK16-NEXT: notb %ch
-; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi
-; FALLBACK16-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %ebx, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: addl %eax, %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %esi, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp
-; FALLBACK16-NEXT: movl %ebp, %esi
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: movl %edx, %ebx
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: movl 48(%esp,%eax), %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: leal (%edx,%edx), %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %esi, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %ebx, %edx
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: addl %ebp, %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %edi, %ebp
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK16-NEXT: movl 52(%esp,%esi), %edi
-; FALLBACK16-NEXT: movl %edi, %eax
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 56(%esp,%esi), %ebx
-; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: orl %eax, %esi
-; FALLBACK16-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: addl %edi, %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: orl %eax, %edi
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl 60(%esp,%eax), %eax
-; FALLBACK16-NEXT: leal (%eax,%eax), %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: orl %ebx, %edx
-; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; FALLBACK16-NEXT: sarl %cl, %eax
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT: movl %eax, 28(%ecx)
-; FALLBACK16-NEXT: movl %edx, 24(%ecx)
-; FALLBACK16-NEXT: movl %edi, 16(%ecx)
-; FALLBACK16-NEXT: movl %esi, 20(%ecx)
-; FALLBACK16-NEXT: movl %ebp, 8(%ecx)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, 12(%ecx)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, (%ecx)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, 4(%ecx)
-; FALLBACK16-NEXT: addl $108, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: ashr_32bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebp
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $92, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl (%ecx), %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%ecx), %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 8(%ecx), %edx
-; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 12(%ecx), %ebp
-; FALLBACK17-NEXT: movl 16(%ecx), %ebx
-; FALLBACK17-NEXT: movzbl (%eax), %eax
-; FALLBACK17-NEXT: movl 20(%ecx), %edi
-; FALLBACK17-NEXT: movl 24(%ecx), %edx
-; FALLBACK17-NEXT: movl 28(%ecx), %esi
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, %ecx
-; FALLBACK17-NEXT: shlb $3, %cl
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: sarl $31, %esi
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: andb $28, %al
-; FALLBACK17-NEXT: movzbl %al, %ebp
-; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx
-; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %esi
-; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi
-; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edi
-; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK17-NEXT: movl %edx, 24(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: shrdl %cl, %edx, %esi
-; FALLBACK17-NEXT: sarl %cl, %eax
-; FALLBACK17-NEXT: movl %eax, 28(%ebp)
-; FALLBACK17-NEXT: movl %ebx, 16(%ebp)
-; FALLBACK17-NEXT: movl %edi, 20(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 8(%ebp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 12(%ebp)
-; FALLBACK17-NEXT: movl %esi, (%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 4(%ebp)
-; FALLBACK17-NEXT: addl $92, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: popl %ebp
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: ashr_32bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $108, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi
-; FALLBACK18-NEXT: movl (%esi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%esi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%esi), %ebx
-; FALLBACK18-NEXT: movl 12(%esi), %ebp
-; FALLBACK18-NEXT: movl 16(%esi), %edi
-; FALLBACK18-NEXT: movzbl (%ecx), %ecx
-; FALLBACK18-NEXT: movl 20(%esi), %edx
-; FALLBACK18-NEXT: movl 24(%esi), %eax
-; FALLBACK18-NEXT: movl 28(%esi), %esi
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, %eax
-; FALLBACK18-NEXT: shlb $3, %al
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: sarl $31, %esi
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: andb $28, %cl
-; FALLBACK18-NEXT: movzbl %cl, %edi
-; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx
-; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx
-; FALLBACK18-NEXT: movl %eax, %edx
-; FALLBACK18-NEXT: notb %dl
-; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp
-; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp
-; FALLBACK18-NEXT: orl %ebx, %ebp
-; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK18-NEXT: orl %ebx, %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi
-; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp
-; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx
-; FALLBACK18-NEXT: orl %ebx, %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx
-; FALLBACK18-NEXT: movl %eax, %ebx
-; FALLBACK18-NEXT: addl %ebp, %ebp
-; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax
-; FALLBACK18-NEXT: orl %ecx, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp
-; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax
-; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi
-; FALLBACK18-NEXT: orl %esi, %ecx
-; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: addl %eax, %eax
-; FALLBACK18-NEXT: shlxl %edx, %eax, %esi
-; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax
-; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %edx, %edi, %edx
-; FALLBACK18-NEXT: orl %eax, %edx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl %ebx, 28(%eax)
-; FALLBACK18-NEXT: movl %edx, 24(%eax)
-; FALLBACK18-NEXT: movl %esi, 16(%eax)
-; FALLBACK18-NEXT: movl %ecx, 20(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 8(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 12(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, (%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 4(%eax)
-; FALLBACK18-NEXT: addl $108, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: ashr_32bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $92, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl (%ecx), %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 4(%ecx), %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%ecx), %edx
-; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 12(%ecx), %ebp
-; FALLBACK19-NEXT: movl 16(%ecx), %ebx
-; FALLBACK19-NEXT: movzbl (%eax), %eax
-; FALLBACK19-NEXT: movl 20(%ecx), %edi
-; FALLBACK19-NEXT: movl 24(%ecx), %edx
-; FALLBACK19-NEXT: movl 28(%ecx), %esi
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, %ecx
-; FALLBACK19-NEXT: shlb $3, %cl
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: sarl $31, %esi
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: andb $28, %al
-; FALLBACK19-NEXT: movzbl %al, %ebp
-; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi
-; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %esi, %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx
-; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx
-; FALLBACK19-NEXT: movl %edx, %esi
-; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx
-; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi
-; FALLBACK19-NEXT: shrdl %cl, %edi, %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl %eax, 24(%ebp)
-; FALLBACK19-NEXT: sarxl %ecx, %edi, %eax
-; FALLBACK19-NEXT: movl %eax, 28(%ebp)
-; FALLBACK19-NEXT: movl %ebx, 16(%ebp)
-; FALLBACK19-NEXT: movl %esi, 20(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 8(%ebp)
-; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 12(%ebp)
-; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK19-NEXT: movl %edx, (%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 4(%ebp)
-; FALLBACK19-NEXT: addl $92, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: ashr_32bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $108, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movl 16(%ecx), %esi
-; FALLBACK20-NEXT: movl 20(%ecx), %edi
-; FALLBACK20-NEXT: movl 24(%ecx), %ebx
-; FALLBACK20-NEXT: movl 28(%ecx), %edx
-; FALLBACK20-NEXT: movzbl (%eax), %eax
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shlb $3, %cl
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: sarl $31, %edx
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: andb $28, %al
-; FALLBACK20-NEXT: movzbl %al, %edi
-; FALLBACK20-NEXT: movl 32(%esp,%edi), %eax
-; FALLBACK20-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: movl %ecx, %edx
-; FALLBACK20-NEXT: movb %cl, %dh
-; FALLBACK20-NEXT: notb %dl
-; FALLBACK20-NEXT: addl %esi, %esi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: orl %eax, %esi
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %eax
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: movl 48(%esp,%edi), %esi
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: addl %esi, %esi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: orl %eax, %esi
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi
-; FALLBACK20-NEXT: movl %esi, %eax
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %eax, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp
-; FALLBACK20-NEXT: movl %ebp, %eax
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %eax, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: addl %ebp, %ebp
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: orl %eax, %ebp
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: movl 60(%esp,%edi), %eax
-; FALLBACK20-NEXT: leal (%eax,%eax), %edi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: orl %ebx, %edi
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: addl %esi, %esi
-; FALLBACK20-NEXT: movl %edx, %ecx
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: orl %ebx, %esi
-; FALLBACK20-NEXT: movb %dh, %cl
-; FALLBACK20-NEXT: sarl %cl, %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movl %eax, 28(%ecx)
-; FALLBACK20-NEXT: movl %esi, 4(%ecx)
-; FALLBACK20-NEXT: movl %edi, 24(%ecx)
-; FALLBACK20-NEXT: movl %ebp, 16(%ecx)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movl %eax, 20(%ecx)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movl %eax, 8(%ecx)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movl %eax, 12(%ecx)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movl %eax, (%ecx)
-; FALLBACK20-NEXT: addl $108, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: ashr_32bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $108, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movups (%ecx), %xmm0
-; FALLBACK21-NEXT: movl 16(%ecx), %esi
-; FALLBACK21-NEXT: movl 20(%ecx), %edi
-; FALLBACK21-NEXT: movl 24(%ecx), %ebx
-; FALLBACK21-NEXT: movl 28(%ecx), %edx
-; FALLBACK21-NEXT: movzbl (%eax), %eax
-; FALLBACK21-NEXT: movl %eax, %ecx
-; FALLBACK21-NEXT: shlb $3, %cl
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: sarl $31, %edx
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: andb $28, %al
-; FALLBACK21-NEXT: movzbl %al, %ebp
-; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx
-; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi
-; FALLBACK21-NEXT: movl %edi, %esi
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK21-NEXT: movl %esi, 4(%ebp)
-; FALLBACK21-NEXT: movl %ebx, 24(%ebp)
-; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK21-NEXT: sarl %cl, %eax
-; FALLBACK21-NEXT: movl %eax, 28(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 16(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 20(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 8(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 12(%ebp)
-; FALLBACK21-NEXT: movl %edx, (%ebp)
-; FALLBACK21-NEXT: addl $108, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: ashr_32bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $108, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movl 16(%ecx), %esi
-; FALLBACK22-NEXT: movl 20(%ecx), %edi
-; FALLBACK22-NEXT: movl 24(%ecx), %ebx
-; FALLBACK22-NEXT: movl 28(%ecx), %edx
-; FALLBACK22-NEXT: movzbl (%eax), %ecx
-; FALLBACK22-NEXT: movl %ecx, %eax
-; FALLBACK22-NEXT: shlb $3, %al
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: sarl $31, %edx
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: andb $28, %cl
-; FALLBACK22-NEXT: movzbl %cl, %edi
-; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
-; FALLBACK22-NEXT: movl %eax, %edx
-; FALLBACK22-NEXT: notb %dl
-; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK22-NEXT: orl %ecx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %ecx, %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi
-; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx
-; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx
-; FALLBACK22-NEXT: orl %ebx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %ecx, %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi
-; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx
-; FALLBACK22-NEXT: movl %eax, %ecx
-; FALLBACK22-NEXT: orl %ebx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi
-; FALLBACK22-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK22-NEXT: shlxl %edx, %ebx, %eax
-; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx
-; FALLBACK22-NEXT: shrxl %ecx, %ebx, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl %ecx, %eax
-; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %ebx, %ebx
-; FALLBACK22-NEXT: shlxl %edx, %ebx, %ebx
-; FALLBACK22-NEXT: orl %ebp, %ebx
-; FALLBACK22-NEXT: shrxl %ecx, %esi, %ecx
-; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK22-NEXT: sarxl %eax, %edi, %eax
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %edx, %edi, %edi
-; FALLBACK22-NEXT: orl %ecx, %edi
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: addl %ecx, %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK22-NEXT: orl %esi, %ecx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK22-NEXT: movl %eax, 28(%edx)
-; FALLBACK22-NEXT: movl %ecx, 4(%edx)
-; FALLBACK22-NEXT: movl %edi, 24(%edx)
-; FALLBACK22-NEXT: movl %ebx, 16(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 20(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 8(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, 12(%edx)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: movl %eax, (%edx)
-; FALLBACK22-NEXT: addl $108, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: ashr_32bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $108, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movups (%ecx), %xmm0
-; FALLBACK23-NEXT: movl 16(%ecx), %esi
-; FALLBACK23-NEXT: movl 20(%ecx), %edi
-; FALLBACK23-NEXT: movl 24(%ecx), %ebx
-; FALLBACK23-NEXT: movl 28(%ecx), %edx
-; FALLBACK23-NEXT: movzbl (%eax), %eax
-; FALLBACK23-NEXT: movl %eax, %ecx
-; FALLBACK23-NEXT: shlb $3, %cl
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: sarl $31, %edx
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: andb $28, %al
-; FALLBACK23-NEXT: movzbl %al, %ebx
-; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp
-; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax
-; FALLBACK23-NEXT: movl %eax, %edi
-; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi
-; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax
-; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp
-; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx
-; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx
-; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl %ebx, 4(%eax)
-; FALLBACK23-NEXT: movl %ebp, 24(%eax)
-; FALLBACK23-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; FALLBACK23-NEXT: movl %ebx, 28(%eax)
-; FALLBACK23-NEXT: movl %esi, 16(%eax)
-; FALLBACK23-NEXT: movl %edi, 20(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK23-NEXT: movl %esi, 8(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK23-NEXT: movl %esi, 12(%eax)
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, (%eax)
-; FALLBACK23-NEXT: addl $108, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: ashr_32bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $108, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK24-NEXT: movl 16(%ecx), %esi
-; FALLBACK24-NEXT: movl 20(%ecx), %edi
-; FALLBACK24-NEXT: movl 24(%ecx), %ebx
-; FALLBACK24-NEXT: movl 28(%ecx), %edx
-; FALLBACK24-NEXT: movzbl (%eax), %eax
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shlb $3, %cl
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: sarl $31, %edx
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: andb $28, %al
-; FALLBACK24-NEXT: movzbl %al, %edi
-; FALLBACK24-NEXT: movl 32(%esp,%edi), %eax
-; FALLBACK24-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: movl %ecx, %edx
-; FALLBACK24-NEXT: movb %cl, %dh
-; FALLBACK24-NEXT: notb %dl
-; FALLBACK24-NEXT: addl %esi, %esi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: orl %eax, %esi
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %eax
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: movl 48(%esp,%edi), %esi
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: addl %esi, %esi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: orl %eax, %esi
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi
-; FALLBACK24-NEXT: movl %esi, %eax
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %eax, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp
-; FALLBACK24-NEXT: movl %ebp, %eax
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx
-; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %eax, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: addl %ebp, %ebp
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: orl %eax, %ebp
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: movl 60(%esp,%edi), %eax
-; FALLBACK24-NEXT: leal (%eax,%eax), %edi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: orl %ebx, %edi
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: addl %esi, %esi
-; FALLBACK24-NEXT: movl %edx, %ecx
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: orl %ebx, %esi
-; FALLBACK24-NEXT: movb %dh, %cl
-; FALLBACK24-NEXT: sarl %cl, %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: movl %eax, 28(%ecx)
-; FALLBACK24-NEXT: movl %esi, 4(%ecx)
-; FALLBACK24-NEXT: movl %edi, 24(%ecx)
-; FALLBACK24-NEXT: movl %ebp, 16(%ecx)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movl %eax, 20(%ecx)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movl %eax, 8(%ecx)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movl %eax, 12(%ecx)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movl %eax, (%ecx)
-; FALLBACK24-NEXT: addl $108, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: retl
-;
-; FALLBACK25-LABEL: ashr_32bytes:
-; FALLBACK25: # %bb.0:
-; FALLBACK25-NEXT: pushl %ebp
-; FALLBACK25-NEXT: pushl %ebx
-; FALLBACK25-NEXT: pushl %edi
-; FALLBACK25-NEXT: pushl %esi
-; FALLBACK25-NEXT: subl $108, %esp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK25-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK25-NEXT: movl 16(%ecx), %esi
-; FALLBACK25-NEXT: movl 20(%ecx), %edi
-; FALLBACK25-NEXT: movl 24(%ecx), %ebx
-; FALLBACK25-NEXT: movl 28(%ecx), %edx
-; FALLBACK25-NEXT: movzbl (%eax), %eax
-; FALLBACK25-NEXT: movl %eax, %ecx
-; FALLBACK25-NEXT: shlb $3, %cl
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: sarl $31, %edx
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: andb $28, %al
-; FALLBACK25-NEXT: movzbl %al, %ebp
-; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx
-; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi
-; FALLBACK25-NEXT: movl %edi, %esi
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK25-NEXT: movl %esi, 4(%ebp)
-; FALLBACK25-NEXT: movl %ebx, 24(%ebp)
-; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK25-NEXT: sarl %cl, %eax
-; FALLBACK25-NEXT: movl %eax, 28(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 16(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 20(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 8(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 12(%ebp)
-; FALLBACK25-NEXT: movl %edx, (%ebp)
-; FALLBACK25-NEXT: addl $108, %esp
-; FALLBACK25-NEXT: popl %esi
-; FALLBACK25-NEXT: popl %edi
-; FALLBACK25-NEXT: popl %ebx
-; FALLBACK25-NEXT: popl %ebp
-; FALLBACK25-NEXT: retl
-;
-; FALLBACK26-LABEL: ashr_32bytes:
-; FALLBACK26: # %bb.0:
-; FALLBACK26-NEXT: pushl %ebp
-; FALLBACK26-NEXT: pushl %ebx
-; FALLBACK26-NEXT: pushl %edi
-; FALLBACK26-NEXT: pushl %esi
-; FALLBACK26-NEXT: subl $108, %esp
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK26-NEXT: movl 16(%ecx), %esi
-; FALLBACK26-NEXT: movl 20(%ecx), %edi
-; FALLBACK26-NEXT: movl 24(%ecx), %ebx
-; FALLBACK26-NEXT: movl 28(%ecx), %edx
-; FALLBACK26-NEXT: movzbl (%eax), %ecx
-; FALLBACK26-NEXT: movl %ecx, %eax
-; FALLBACK26-NEXT: shlb $3, %al
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: sarl $31, %edx
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: andb $28, %cl
-; FALLBACK26-NEXT: movzbl %cl, %edi
-; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
-; FALLBACK26-NEXT: movl %eax, %edx
-; FALLBACK26-NEXT: notb %dl
-; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %esi, %esi
-; FALLBACK26-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK26-NEXT: orl %ecx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi
-; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx
-; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx
-; FALLBACK26-NEXT: orl %ebx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi
-; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx
-; FALLBACK26-NEXT: movl %eax, %ecx
-; FALLBACK26-NEXT: orl %ebx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi
-; FALLBACK26-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK26-NEXT: shlxl %edx, %ebx, %eax
-; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx
-; FALLBACK26-NEXT: shrxl %ecx, %ebx, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl %ecx, %eax
-; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %ebx, %ebx
-; FALLBACK26-NEXT: shlxl %edx, %ebx, %ebx
-; FALLBACK26-NEXT: orl %ebp, %ebx
-; FALLBACK26-NEXT: shrxl %ecx, %esi, %ecx
-; FALLBACK26-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK26-NEXT: sarxl %eax, %edi, %eax
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %edx, %edi, %edi
-; FALLBACK26-NEXT: orl %ecx, %edi
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK26-NEXT: orl %esi, %ecx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK26-NEXT: movl %eax, 28(%edx)
-; FALLBACK26-NEXT: movl %ecx, 4(%edx)
-; FALLBACK26-NEXT: movl %edi, 24(%edx)
-; FALLBACK26-NEXT: movl %ebx, 16(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 20(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 8(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 12(%edx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, (%edx)
-; FALLBACK26-NEXT: addl $108, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: ashr_32bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $108, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK27-NEXT: movl 16(%ecx), %esi
-; FALLBACK27-NEXT: movl 20(%ecx), %edi
-; FALLBACK27-NEXT: movl 24(%ecx), %ebx
-; FALLBACK27-NEXT: movl 28(%ecx), %edx
-; FALLBACK27-NEXT: movzbl (%eax), %eax
-; FALLBACK27-NEXT: movl %eax, %ecx
-; FALLBACK27-NEXT: shlb $3, %cl
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: sarl $31, %edx
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: andb $28, %al
-; FALLBACK27-NEXT: movzbl %al, %ebx
-; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp
-; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, %edi
-; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi
-; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp
-; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx
-; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx
-; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl %ebx, 4(%eax)
-; FALLBACK27-NEXT: movl %ebp, 24(%eax)
-; FALLBACK27-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; FALLBACK27-NEXT: movl %ebx, 28(%eax)
-; FALLBACK27-NEXT: movl %esi, 16(%eax)
-; FALLBACK27-NEXT: movl %edi, 20(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK27-NEXT: movl %esi, 8(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK27-NEXT: movl %esi, 12(%eax)
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, (%eax)
-; FALLBACK27-NEXT: addl $108, %esp
-; FALLBACK27-NEXT: popl %esi
-; FALLBACK27-NEXT: popl %edi
-; FALLBACK27-NEXT: popl %ebx
-; FALLBACK27-NEXT: popl %ebp
-; FALLBACK27-NEXT: retl
-;
-; FALLBACK28-LABEL: ashr_32bytes:
-; FALLBACK28: # %bb.0:
-; FALLBACK28-NEXT: pushl %ebp
-; FALLBACK28-NEXT: pushl %ebx
-; FALLBACK28-NEXT: pushl %edi
-; FALLBACK28-NEXT: pushl %esi
-; FALLBACK28-NEXT: subl $108, %esp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK28-NEXT: movl 16(%ecx), %esi
-; FALLBACK28-NEXT: movl 20(%ecx), %edi
-; FALLBACK28-NEXT: movl 24(%ecx), %ebx
-; FALLBACK28-NEXT: movl 28(%ecx), %edx
-; FALLBACK28-NEXT: movzbl (%eax), %eax
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shlb $3, %cl
-; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: sarl $31, %edx
-; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: andb $28, %al
-; FALLBACK28-NEXT: movzbl %al, %edi
-; FALLBACK28-NEXT: movl 32(%esp,%edi), %eax
-; FALLBACK28-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: movl %ecx, %edx
-; FALLBACK28-NEXT: movb %cl, %dh
-; FALLBACK28-NEXT: notb %dl
-; FALLBACK28-NEXT: addl %esi, %esi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: orl %eax, %esi
-; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %eax
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: movl 48(%esp,%edi), %esi
-; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: addl %esi, %esi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: orl %eax, %esi
-; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi
-; FALLBACK28-NEXT: movl %esi, %eax
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %eax, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp
-; FALLBACK28-NEXT: movl %ebp, %eax
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %eax, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: addl %ebp, %ebp
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: orl %eax, %ebp
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: movl 60(%esp,%edi), %eax
-; FALLBACK28-NEXT: leal (%eax,%eax), %edi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: orl %ebx, %edi
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: addl %esi, %esi
-; FALLBACK28-NEXT: movl %edx, %ecx
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: orl %ebx, %esi
-; FALLBACK28-NEXT: movb %dh, %cl
-; FALLBACK28-NEXT: sarl %cl, %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: movl %eax, 28(%ecx)
-; FALLBACK28-NEXT: movl %esi, 4(%ecx)
-; FALLBACK28-NEXT: movl %edi, 24(%ecx)
-; FALLBACK28-NEXT: movl %ebp, 16(%ecx)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: movl %eax, 20(%ecx)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: movl %eax, 8(%ecx)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: movl %eax, 12(%ecx)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: movl %eax, (%ecx)
-; FALLBACK28-NEXT: addl $108, %esp
-; FALLBACK28-NEXT: popl %esi
-; FALLBACK28-NEXT: popl %edi
-; FALLBACK28-NEXT: popl %ebx
-; FALLBACK28-NEXT: popl %ebp
-; FALLBACK28-NEXT: retl
-;
-; FALLBACK29-LABEL: ashr_32bytes:
-; FALLBACK29: # %bb.0:
-; FALLBACK29-NEXT: pushl %ebp
-; FALLBACK29-NEXT: pushl %ebx
-; FALLBACK29-NEXT: pushl %edi
-; FALLBACK29-NEXT: pushl %esi
-; FALLBACK29-NEXT: subl $108, %esp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK29-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK29-NEXT: movl 16(%ecx), %esi
-; FALLBACK29-NEXT: movl 20(%ecx), %edi
-; FALLBACK29-NEXT: movl 24(%ecx), %ebx
-; FALLBACK29-NEXT: movl 28(%ecx), %edx
-; FALLBACK29-NEXT: movzbl (%eax), %eax
-; FALLBACK29-NEXT: movl %eax, %ecx
-; FALLBACK29-NEXT: shlb $3, %cl
-; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: sarl $31, %edx
-; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: andb $28, %al
-; FALLBACK29-NEXT: movzbl %al, %ebp
-; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx
-; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi
-; FALLBACK29-NEXT: movl %edi, %esi
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK29-NEXT: movl %esi, 4(%ebp)
-; FALLBACK29-NEXT: movl %ebx, 24(%ebp)
-; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK29-NEXT: sarl %cl, %eax
-; FALLBACK29-NEXT: movl %eax, 28(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 16(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 20(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 8(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 12(%ebp)
-; FALLBACK29-NEXT: movl %edx, (%ebp)
-; FALLBACK29-NEXT: addl $108, %esp
-; FALLBACK29-NEXT: popl %esi
-; FALLBACK29-NEXT: popl %edi
-; FALLBACK29-NEXT: popl %ebx
-; FALLBACK29-NEXT: popl %ebp
-; FALLBACK29-NEXT: retl
-;
-; FALLBACK30-LABEL: ashr_32bytes:
-; FALLBACK30: # %bb.0:
-; FALLBACK30-NEXT: pushl %ebp
-; FALLBACK30-NEXT: pushl %ebx
-; FALLBACK30-NEXT: pushl %edi
-; FALLBACK30-NEXT: pushl %esi
-; FALLBACK30-NEXT: subl $108, %esp
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK30-NEXT: movl 16(%ecx), %esi
-; FALLBACK30-NEXT: movl 20(%ecx), %edi
-; FALLBACK30-NEXT: movl 24(%ecx), %ebx
-; FALLBACK30-NEXT: movl 28(%ecx), %edx
-; FALLBACK30-NEXT: movzbl (%eax), %ecx
-; FALLBACK30-NEXT: movl %ecx, %eax
-; FALLBACK30-NEXT: shlb $3, %al
-; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: sarl $31, %edx
-; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: andb $28, %cl
-; FALLBACK30-NEXT: movzbl %cl, %edi
-; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx
-; FALLBACK30-NEXT: movl %eax, %edx
-; FALLBACK30-NEXT: notb %dl
-; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %esi, %esi
-; FALLBACK30-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK30-NEXT: orl %ecx, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %ecx, %ecx
-; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi
-; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx
-; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx
-; FALLBACK30-NEXT: orl %ebx, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %ecx, %ecx
-; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi
-; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx
-; FALLBACK30-NEXT: movl %eax, %ecx
-; FALLBACK30-NEXT: orl %ebx, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi
-; FALLBACK30-NEXT: leal (%esi,%esi), %ebx
-; FALLBACK30-NEXT: shlxl %edx, %ebx, %eax
-; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx
-; FALLBACK30-NEXT: shrxl %ecx, %ebx, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl %ecx, %eax
-; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %ebx, %ebx
-; FALLBACK30-NEXT: shlxl %edx, %ebx, %ebx
-; FALLBACK30-NEXT: orl %ebp, %ebx
-; FALLBACK30-NEXT: shrxl %ecx, %esi, %ecx
-; FALLBACK30-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi
-; FALLBACK30-NEXT: sarxl %eax, %edi, %eax
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %edx, %edi, %edi
-; FALLBACK30-NEXT: orl %ecx, %edi
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: addl %ecx, %ecx
-; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx
-; FALLBACK30-NEXT: orl %esi, %ecx
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx
-; FALLBACK30-NEXT: movl %eax, 28(%edx)
-; FALLBACK30-NEXT: movl %ecx, 4(%edx)
-; FALLBACK30-NEXT: movl %edi, 24(%edx)
-; FALLBACK30-NEXT: movl %ebx, 16(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 20(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 8(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 12(%edx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, (%edx)
-; FALLBACK30-NEXT: addl $108, %esp
-; FALLBACK30-NEXT: popl %esi
-; FALLBACK30-NEXT: popl %edi
-; FALLBACK30-NEXT: popl %ebx
-; FALLBACK30-NEXT: popl %ebp
-; FALLBACK30-NEXT: retl
-;
-; FALLBACK31-LABEL: ashr_32bytes:
-; FALLBACK31: # %bb.0:
-; FALLBACK31-NEXT: pushl %ebp
-; FALLBACK31-NEXT: pushl %ebx
-; FALLBACK31-NEXT: pushl %edi
-; FALLBACK31-NEXT: pushl %esi
-; FALLBACK31-NEXT: subl $108, %esp
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK31-NEXT: vmovups (%ecx), %xmm0
-; FALLBACK31-NEXT: movl 16(%ecx), %esi
-; FALLBACK31-NEXT: movl 20(%ecx), %edi
-; FALLBACK31-NEXT: movl 24(%ecx), %ebx
-; FALLBACK31-NEXT: movl 28(%ecx), %edx
-; FALLBACK31-NEXT: movzbl (%eax), %eax
-; FALLBACK31-NEXT: movl %eax, %ecx
-; FALLBACK31-NEXT: shlb $3, %cl
-; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: sarl $31, %edx
-; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: andb $28, %al
-; FALLBACK31-NEXT: movzbl %al, %ebx
-; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi
-; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp
-; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax
-; FALLBACK31-NEXT: movl %eax, %edi
-; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi
-; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax
-; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp
-; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx
-; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx
-; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl %ebx, 4(%eax)
-; FALLBACK31-NEXT: movl %ebp, 24(%eax)
-; FALLBACK31-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; FALLBACK31-NEXT: movl %ebx, 28(%eax)
-; FALLBACK31-NEXT: movl %esi, 16(%eax)
-; FALLBACK31-NEXT: movl %edi, 20(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK31-NEXT: movl %esi, 8(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK31-NEXT: movl %esi, 12(%eax)
-; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, (%eax)
-; FALLBACK31-NEXT: addl $108, %esp
-; FALLBACK31-NEXT: popl %esi
-; FALLBACK31-NEXT: popl %edi
-; FALLBACK31-NEXT: popl %ebx
-; FALLBACK31-NEXT: popl %ebp
-; FALLBACK31-NEXT: retl
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rsi,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $24, %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, -72(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rsi,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $24, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%esi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %dl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%edi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%eax), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%eax,%eax), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 16(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 20(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 8(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %al, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%edx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %dl, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%ebp,%ebp), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, 32(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%esi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %eax, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%esi), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%esi), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ebp, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 28(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 20(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edi)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $28, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %al, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ecx, %edi, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $92, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %cl, %dh
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 24(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 16(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, (%ecx)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%ecx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%ecx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%ecx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %al, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%ecx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %dl, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %eax, 32(%esp,%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%ecx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%ecx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%ecx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $28, %al
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %al, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes:
+; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 16(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 20(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 24(%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 28(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %cl, %dh
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %dl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%edi), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%edi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dh, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 24(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 16(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, (%ecx)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 16(%ecx), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 20(%ecx), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 24(%ecx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 28(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $28, %al
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %al, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 44(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 40(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 32(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 36(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %ebp, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $108, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%ecx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %bl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %dl, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %eax, 32(%esp,%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %bl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebp, %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edx, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %edx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebp, %edx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%ecx,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 16(%ecx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 20(%ecx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 24(%ecx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 28(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $3, %cl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $28, %al
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %al, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 44(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 40(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 32(%esp,%ebx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 36(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $108, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl
%src = load i256, ptr %src.ptr, align 1
%byteOff = load i256, ptr %byteOff.ptr, align 1
%bitOff = shl i256 %byteOff, 3
@@ -10996,663 +8868,500 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
}
define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rcx
-; FALLBACK0-NEXT: movq 8(%rdi), %r8
-; FALLBACK0-NEXT: movq 16(%rdi), %r9
-; FALLBACK0-NEXT: movq 24(%rdi), %rdi
-; FALLBACK0-NEXT: movzbl (%rsi), %esi
-; FALLBACK0-NEXT: movl %esi, %eax
-; FALLBACK0-NEXT: shlb $5, %al
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: sarq $63, %rdi
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: andb $6, %sil
-; FALLBACK0-NEXT: movzbl %sil, %r9d
-; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx
-; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: orq %r11, %r8
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r10
-; FALLBACK0-NEXT: addq %rdi, %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rdi
-; FALLBACK0-NEXT: orq %r10, %rdi
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK0-NEXT: leaq (%r9,%r9), %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: orq %rbx, %r10
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: sarq %cl, %r9
-; FALLBACK0-NEXT: movq %r9, 24(%rdx)
-; FALLBACK0-NEXT: movq %r10, 16(%rdx)
-; FALLBACK0-NEXT: movq %rdi, (%rdx)
-; FALLBACK0-NEXT: movq %r8, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %rdi
-; FALLBACK1-NEXT: movzbl (%rsi), %esi
-; FALLBACK1-NEXT: movl %esi, %ecx
-; FALLBACK1-NEXT: shlb $5, %cl
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: sarq $63, %rdi
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: andb $6, %sil
-; FALLBACK1-NEXT: movzbl %sil, %eax
-; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi
-; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi
-; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8
-; FALLBACK1-NEXT: movq %r8, %r9
-; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax
-; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK1-NEXT: sarq %cl, %rax
-; FALLBACK1-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rax, 24(%rdx)
-; FALLBACK1-NEXT: movq %rdi, (%rdx)
-; FALLBACK1-NEXT: movq %r9, 8(%rdx)
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %rdi
-; FALLBACK2-NEXT: movzbl (%rsi), %esi
-; FALLBACK2-NEXT: movl %esi, %eax
-; FALLBACK2-NEXT: shlb $5, %al
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: sarq $63, %rdi
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: andb $6, %sil
-; FALLBACK2-NEXT: movzbl %sil, %ecx
-; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi
-; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi
-; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8
-; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9
-; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK2-NEXT: notb %al
-; FALLBACK2-NEXT: addq %rdi, %rdi
-; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r8, %rdi
-; FALLBACK2-NEXT: addq %rsi, %rsi
-; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r9, %rsi
-; FALLBACK2-NEXT: addq %rcx, %rcx
-; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax
-; FALLBACK2-NEXT: orq %r10, %rax
-; FALLBACK2-NEXT: movq %r11, 24(%rdx)
-; FALLBACK2-NEXT: movq %rax, 16(%rdx)
-; FALLBACK2-NEXT: movq %rsi, (%rdx)
-; FALLBACK2-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: movq (%rdi), %rax
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %rdi
-; FALLBACK3-NEXT: movzbl (%rsi), %esi
-; FALLBACK3-NEXT: movl %esi, %ecx
-; FALLBACK3-NEXT: shlb $5, %cl
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: sarq $63, %rdi
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: andb $6, %sil
-; FALLBACK3-NEXT: movzbl %sil, %eax
-; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi
-; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi
-; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8
-; FALLBACK3-NEXT: movq %r8, %r9
-; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9
-; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax
-; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi
-; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi
-; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
-; FALLBACK3-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rax, 24(%rdx)
-; FALLBACK3-NEXT: movq %rdi, (%rdx)
-; FALLBACK3-NEXT: movq %r9, 8(%rdx)
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: pushq %rbx
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movq 16(%rdi), %rcx
-; FALLBACK4-NEXT: movq 24(%rdi), %rdi
-; FALLBACK4-NEXT: movzbl (%rsi), %esi
-; FALLBACK4-NEXT: movl %esi, %eax
-; FALLBACK4-NEXT: shlb $5, %al
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: sarq $63, %rdi
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: andb $6, %sil
-; FALLBACK4-NEXT: movzbl %sil, %r9d
-; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rdi
-; FALLBACK4-NEXT: orq %r10, %rdi
-; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10
-; FALLBACK4-NEXT: movq %r10, %r11
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r11
-; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rbx
-; FALLBACK4-NEXT: orq %r11, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r8
-; FALLBACK4-NEXT: addq %r10, %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: orq %r8, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: sarq %cl, %r9
-; FALLBACK4-NEXT: movq %r9, 24(%rdx)
-; FALLBACK4-NEXT: movq %r10, 8(%rdx)
-; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK4-NEXT: movq %rdi, (%rdx)
-; FALLBACK4-NEXT: popq %rbx
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movq 16(%rdi), %rax
-; FALLBACK5-NEXT: movq 24(%rdi), %rdi
-; FALLBACK5-NEXT: movzbl (%rsi), %esi
-; FALLBACK5-NEXT: movl %esi, %ecx
-; FALLBACK5-NEXT: shlb $5, %cl
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: sarq $63, %rdi
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: andb $6, %sil
-; FALLBACK5-NEXT: movzbl %sil, %eax
-; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK5-NEXT: movq %rdi, %r8
-; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK5-NEXT: movq %rax, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK5-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK5-NEXT: sarq %cl, %rsi
-; FALLBACK5-NEXT: movq %r10, 8(%rdx)
-; FALLBACK5-NEXT: movq %r8, 16(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r9, (%rdx)
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movq 16(%rdi), %rcx
-; FALLBACK6-NEXT: movq 24(%rdi), %rdi
-; FALLBACK6-NEXT: movzbl (%rsi), %esi
-; FALLBACK6-NEXT: movl %esi, %eax
-; FALLBACK6-NEXT: shlb $5, %al
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: sarq $63, %rdi
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: andb $6, %sil
-; FALLBACK6-NEXT: movzbl %sil, %ecx
-; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8
-; FALLBACK6-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK6-NEXT: notb %al
-; FALLBACK6-NEXT: addq %rdi, %rdi
-; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK6-NEXT: orq %rsi, %rdi
-; FALLBACK6-NEXT: addq %rcx, %rcx
-; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK6-NEXT: orq %r9, %rcx
-; FALLBACK6-NEXT: addq %r8, %r8
-; FALLBACK6-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK6-NEXT: orq %r10, %rax
-; FALLBACK6-NEXT: movq %r11, 24(%rdx)
-; FALLBACK6-NEXT: movq %rax, 8(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK6-NEXT: movq %rdi, (%rdx)
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movq 16(%rdi), %rax
-; FALLBACK7-NEXT: movq 24(%rdi), %rdi
-; FALLBACK7-NEXT: movzbl (%rsi), %esi
-; FALLBACK7-NEXT: movl %esi, %ecx
-; FALLBACK7-NEXT: shlb $5, %cl
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: sarq $63, %rdi
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: andb $6, %sil
-; FALLBACK7-NEXT: movzbl %sil, %eax
-; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK7-NEXT: movq %rdi, %r8
-; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK7-NEXT: movq %rax, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK7-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK7-NEXT: movq %r10, 8(%rdx)
-; FALLBACK7-NEXT: movq %r8, 16(%rdx)
-; FALLBACK7-NEXT: movq %rax, 24(%rdx)
-; FALLBACK7-NEXT: movq %r9, (%rdx)
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: pushq %rbx
-; FALLBACK8-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK8-NEXT: movq 16(%rdi), %rcx
-; FALLBACK8-NEXT: movq 24(%rdi), %rdi
-; FALLBACK8-NEXT: movzbl (%rsi), %esi
-; FALLBACK8-NEXT: movl %esi, %eax
-; FALLBACK8-NEXT: shlb $5, %al
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: sarq $63, %rdi
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: andb $6, %sil
-; FALLBACK8-NEXT: movzbl %sil, %r9d
-; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rdi
-; FALLBACK8-NEXT: orq %r10, %rdi
-; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10
-; FALLBACK8-NEXT: movq %r10, %r11
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r11
-; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rbx
-; FALLBACK8-NEXT: orq %r11, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r8
-; FALLBACK8-NEXT: addq %r10, %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: orq %r8, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: sarq %cl, %r9
-; FALLBACK8-NEXT: movq %r9, 24(%rdx)
-; FALLBACK8-NEXT: movq %r10, 8(%rdx)
-; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK8-NEXT: movq %rdi, (%rdx)
-; FALLBACK8-NEXT: popq %rbx
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK9-NEXT: movq 16(%rdi), %rax
-; FALLBACK9-NEXT: movq 24(%rdi), %rdi
-; FALLBACK9-NEXT: movzbl (%rsi), %esi
-; FALLBACK9-NEXT: movl %esi, %ecx
-; FALLBACK9-NEXT: shlb $5, %cl
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: sarq $63, %rdi
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: andb $6, %sil
-; FALLBACK9-NEXT: movzbl %sil, %eax
-; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK9-NEXT: movq %rdi, %r8
-; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK9-NEXT: movq %rax, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK9-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK9-NEXT: sarq %cl, %rsi
-; FALLBACK9-NEXT: movq %r10, 8(%rdx)
-; FALLBACK9-NEXT: movq %r8, 16(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r9, (%rdx)
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK10-NEXT: movq 16(%rdi), %rcx
-; FALLBACK10-NEXT: movq 24(%rdi), %rdi
-; FALLBACK10-NEXT: movzbl (%rsi), %esi
-; FALLBACK10-NEXT: movl %esi, %eax
-; FALLBACK10-NEXT: shlb $5, %al
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: sarq $63, %rdi
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: andb $6, %sil
-; FALLBACK10-NEXT: movzbl %sil, %ecx
-; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8
-; FALLBACK10-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK10-NEXT: notb %al
-; FALLBACK10-NEXT: addq %rdi, %rdi
-; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK10-NEXT: orq %rsi, %rdi
-; FALLBACK10-NEXT: addq %rcx, %rcx
-; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK10-NEXT: orq %r9, %rcx
-; FALLBACK10-NEXT: addq %r8, %r8
-; FALLBACK10-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK10-NEXT: orq %r10, %rax
-; FALLBACK10-NEXT: movq %r11, 24(%rdx)
-; FALLBACK10-NEXT: movq %rax, 8(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK10-NEXT: movq %rdi, (%rdx)
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK11-NEXT: movq 16(%rdi), %rax
-; FALLBACK11-NEXT: movq 24(%rdi), %rdi
-; FALLBACK11-NEXT: movzbl (%rsi), %esi
-; FALLBACK11-NEXT: movl %esi, %ecx
-; FALLBACK11-NEXT: shlb $5, %cl
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: sarq $63, %rdi
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: andb $6, %sil
-; FALLBACK11-NEXT: movzbl %sil, %eax
-; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK11-NEXT: movq %rdi, %r8
-; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK11-NEXT: movq %rax, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK11-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK11-NEXT: movq %r10, 8(%rdx)
-; FALLBACK11-NEXT: movq %r8, 16(%rdx)
-; FALLBACK11-NEXT: movq %rax, 24(%rdx)
-; FALLBACK11-NEXT: movq %r9, (%rdx)
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: pushq %rbx
-; FALLBACK12-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK12-NEXT: movq 16(%rdi), %rcx
-; FALLBACK12-NEXT: movq 24(%rdi), %rdi
-; FALLBACK12-NEXT: movzbl (%rsi), %esi
-; FALLBACK12-NEXT: movl %esi, %eax
-; FALLBACK12-NEXT: shlb $5, %al
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: sarq $63, %rdi
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: andb $6, %sil
-; FALLBACK12-NEXT: movzbl %sil, %r9d
-; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10
-; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rdi
-; FALLBACK12-NEXT: orq %r10, %rdi
-; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10
-; FALLBACK12-NEXT: movq %r10, %r11
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r11
-; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9
-; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rbx
-; FALLBACK12-NEXT: orq %r11, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r8
-; FALLBACK12-NEXT: addq %r10, %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: orq %r8, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: sarq %cl, %r9
-; FALLBACK12-NEXT: movq %r9, 24(%rdx)
-; FALLBACK12-NEXT: movq %r10, 8(%rdx)
-; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK12-NEXT: movq %rdi, (%rdx)
-; FALLBACK12-NEXT: popq %rbx
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK13-NEXT: movq 16(%rdi), %rax
-; FALLBACK13-NEXT: movq 24(%rdi), %rdi
-; FALLBACK13-NEXT: movzbl (%rsi), %esi
-; FALLBACK13-NEXT: movl %esi, %ecx
-; FALLBACK13-NEXT: shlb $5, %cl
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: sarq $63, %rdi
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: andb $6, %sil
-; FALLBACK13-NEXT: movzbl %sil, %eax
-; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK13-NEXT: movq %rdi, %r8
-; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK13-NEXT: movq %rax, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK13-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK13-NEXT: sarq %cl, %rsi
-; FALLBACK13-NEXT: movq %r10, 8(%rdx)
-; FALLBACK13-NEXT: movq %r8, 16(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r9, (%rdx)
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK14-NEXT: movq 16(%rdi), %rcx
-; FALLBACK14-NEXT: movq 24(%rdi), %rdi
-; FALLBACK14-NEXT: movzbl (%rsi), %esi
-; FALLBACK14-NEXT: movl %esi, %eax
-; FALLBACK14-NEXT: shlb $5, %al
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: sarq $63, %rdi
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: andb $6, %sil
-; FALLBACK14-NEXT: movzbl %sil, %ecx
-; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi
-; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi
-; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8
-; FALLBACK14-NEXT: shrxq %rax, %r8, %r9
-; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx
-; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10
-; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11
-; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax
-; FALLBACK14-NEXT: notb %al
-; FALLBACK14-NEXT: addq %rdi, %rdi
-; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi
-; FALLBACK14-NEXT: orq %rsi, %rdi
-; FALLBACK14-NEXT: addq %rcx, %rcx
-; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx
-; FALLBACK14-NEXT: orq %r9, %rcx
-; FALLBACK14-NEXT: addq %r8, %r8
-; FALLBACK14-NEXT: shlxq %rax, %r8, %rax
-; FALLBACK14-NEXT: orq %r10, %rax
-; FALLBACK14-NEXT: movq %r11, 24(%rdx)
-; FALLBACK14-NEXT: movq %rax, 8(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK14-NEXT: movq %rdi, (%rdx)
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: ashr_32bytes_dwordOff:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: vmovups (%rdi), %xmm0
-; FALLBACK15-NEXT: movq 16(%rdi), %rax
-; FALLBACK15-NEXT: movq 24(%rdi), %rdi
-; FALLBACK15-NEXT: movzbl (%rsi), %esi
-; FALLBACK15-NEXT: movl %esi, %ecx
-; FALLBACK15-NEXT: shlb $5, %cl
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: sarq $63, %rdi
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: andb $6, %sil
-; FALLBACK15-NEXT: movzbl %sil, %eax
-; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi
-; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi
-; FALLBACK15-NEXT: movq %rdi, %r8
-; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8
-; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9
-; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax
-; FALLBACK15-NEXT: movq %rax, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10
-; FALLBACK15-NEXT: shrdq %cl, %rax, %r9
-; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax
-; FALLBACK15-NEXT: movq %r10, 8(%rdx)
-; FALLBACK15-NEXT: movq %r8, 16(%rdx)
-; FALLBACK15-NEXT: movq %rax, 24(%rdx)
-; FALLBACK15-NEXT: movq %r9, (%rdx)
-; FALLBACK15-NEXT: retq
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9,4), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9,4), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r9,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rsi,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rax,4), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rsi, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r8, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %sil
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rsi,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %al
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andb $6, %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %r9d
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%r9,4), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%r9,4), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -40(%rsp,%r9,4), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r11, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r8, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
+; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %sil
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, -72(%rsp,%rsi,4), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rsi,4), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rsi,4), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r8,%r8), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rsi,4), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rsi,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rcx, %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rdi, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_32bytes_dwordOff:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 16(%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 24(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shlb $5, %cl
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andb $6, %sil
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movzbl %sil, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -48(%rsp,%rax,4), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -56(%rsp,%rax,4), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rsi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax,4), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -64(%rsp,%rax,4), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %rsi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
;
; X86-SSE2-LABEL: ashr_32bytes_dwordOff:
; X86-SSE2: # %bb.0:
@@ -12035,3644 +9744,3629 @@ define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
}
define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: lshr_64bytes:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %r15
-; FALLBACK0-NEXT: pushq %r14
-; FALLBACK0-NEXT: pushq %r13
-; FALLBACK0-NEXT: pushq %r12
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rax
-; FALLBACK0-NEXT: movq 8(%rdi), %rcx
-; FALLBACK0-NEXT: movq 16(%rdi), %r8
-; FALLBACK0-NEXT: movq 24(%rdi), %r9
-; FALLBACK0-NEXT: movq 32(%rdi), %r10
-; FALLBACK0-NEXT: movq 40(%rdi), %r11
-; FALLBACK0-NEXT: movq 48(%rdi), %rbx
-; FALLBACK0-NEXT: movq 56(%rdi), %r14
-; FALLBACK0-NEXT: movl (%rsi), %edi
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: leal (,%rdi,8), %eax
-; FALLBACK0-NEXT: andl $56, %eax
-; FALLBACK0-NEXT: andl $56, %edi
-; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10
-; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8
-; FALLBACK0-NEXT: movq %r8, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx
-; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r9
-; FALLBACK0-NEXT: orq %r11, %r9
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r10
-; FALLBACK0-NEXT: addq %r8, %r8
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: orq %r10, %r8
-; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10
-; FALLBACK0-NEXT: movq %r10, %r15
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r15
-; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14
-; FALLBACK0-NEXT: leaq (%r14,%r14), %r11
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r11
-; FALLBACK0-NEXT: orq %r15, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: addq %r10, %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: orq %rbx, %r10
-; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx
-; FALLBACK0-NEXT: movq %rbx, %r12
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r12
-; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13
-; FALLBACK0-NEXT: leaq (%r13,%r13), %r15
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r15
-; FALLBACK0-NEXT: orq %r12, %r15
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r14
-; FALLBACK0-NEXT: addq %rbx, %rbx
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rbx
-; FALLBACK0-NEXT: orq %r14, %rbx
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r13
-; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi
-; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r14
-; FALLBACK0-NEXT: orq %r13, %r14
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rdi
-; FALLBACK0-NEXT: movq %rdi, 56(%rdx)
-; FALLBACK0-NEXT: movq %r14, 48(%rdx)
-; FALLBACK0-NEXT: movq %rbx, 32(%rdx)
-; FALLBACK0-NEXT: movq %r15, 40(%rdx)
-; FALLBACK0-NEXT: movq %r10, 16(%rdx)
-; FALLBACK0-NEXT: movq %r11, 24(%rdx)
-; FALLBACK0-NEXT: movq %r8, (%rdx)
-; FALLBACK0-NEXT: movq %r9, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: popq %r12
-; FALLBACK0-NEXT: popq %r13
-; FALLBACK0-NEXT: popq %r14
-; FALLBACK0-NEXT: popq %r15
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: lshr_64bytes:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: pushq %r15
-; FALLBACK1-NEXT: pushq %r14
-; FALLBACK1-NEXT: pushq %rbx
-; FALLBACK1-NEXT: movq (%rdi), %rcx
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %r10
-; FALLBACK1-NEXT: movq 32(%rdi), %r11
-; FALLBACK1-NEXT: movq 40(%rdi), %rbx
-; FALLBACK1-NEXT: movq 48(%rdi), %r14
-; FALLBACK1-NEXT: movq 56(%rdi), %rdi
-; FALLBACK1-NEXT: movl (%rsi), %eax
-; FALLBACK1-NEXT: xorps %xmm0, %xmm0
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: leal (,%rax,8), %ecx
-; FALLBACK1-NEXT: andl $56, %ecx
-; FALLBACK1-NEXT: andl $56, %eax
-; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi
-; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi
-; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9
-; FALLBACK1-NEXT: movq %r9, %r8
-; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8
-; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10
-; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11
-; FALLBACK1-NEXT: movq %r11, %rbx
-; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx
-; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11
-; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14
-; FALLBACK1-NEXT: movq %r14, %r15
-; FALLBACK1-NEXT: shrdq %cl, %r11, %r15
-; FALLBACK1-NEXT: shrdq %cl, %r14, %r10
-; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK1-NEXT: shrdq %cl, %rax, %r11
-; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi
-; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK1-NEXT: shrq %cl, %rax
-; FALLBACK1-NEXT: movq %r11, 48(%rdx)
-; FALLBACK1-NEXT: movq %rax, 56(%rdx)
-; FALLBACK1-NEXT: movq %r10, 32(%rdx)
-; FALLBACK1-NEXT: movq %r15, 40(%rdx)
-; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rbx, 24(%rdx)
-; FALLBACK1-NEXT: movq %rsi, (%rdx)
-; FALLBACK1-NEXT: movq %r8, 8(%rdx)
-; FALLBACK1-NEXT: popq %rbx
-; FALLBACK1-NEXT: popq %r14
-; FALLBACK1-NEXT: popq %r15
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: lshr_64bytes:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: pushq %rbp
-; FALLBACK2-NEXT: pushq %r15
-; FALLBACK2-NEXT: pushq %r14
-; FALLBACK2-NEXT: pushq %r13
-; FALLBACK2-NEXT: pushq %r12
-; FALLBACK2-NEXT: pushq %rbx
-; FALLBACK2-NEXT: pushq %rax
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %r10
-; FALLBACK2-NEXT: movq 32(%rdi), %r11
-; FALLBACK2-NEXT: movq 40(%rdi), %rbx
-; FALLBACK2-NEXT: movq 48(%rdi), %r14
-; FALLBACK2-NEXT: movq 56(%rdi), %rdi
-; FALLBACK2-NEXT: movl (%rsi), %eax
-; FALLBACK2-NEXT: xorps %xmm0, %xmm0
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: leal (,%rax,8), %ecx
-; FALLBACK2-NEXT: andl $56, %ecx
-; FALLBACK2-NEXT: andl $56, %eax
-; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi
-; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9
-; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx
-; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13
-; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi
-; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8
-; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10
-; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11
-; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14
-; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15
-; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp
-; FALLBACK2-NEXT: movl %ecx, %r12d
-; FALLBACK2-NEXT: notb %r12b
-; FALLBACK2-NEXT: addq %r9, %r9
-; FALLBACK2-NEXT: shlxq %r12, %r9, %r9
-; FALLBACK2-NEXT: orq %rbx, %r9
-; FALLBACK2-NEXT: addq %rdi, %rdi
-; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r13, %rdi
-; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx
-; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13
-; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx
-; FALLBACK2-NEXT: addq %r10, %r10
-; FALLBACK2-NEXT: shlxq %r12, %r10, %r10
-; FALLBACK2-NEXT: orq %r8, %r10
-; FALLBACK2-NEXT: addq %rsi, %rsi
-; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r11, %rsi
-; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8
-; FALLBACK2-NEXT: shlxq %r12, %r8, %r8
-; FALLBACK2-NEXT: orq %r15, %r8
-; FALLBACK2-NEXT: addq %r14, %r14
-; FALLBACK2-NEXT: shlxq %r12, %r14, %r11
-; FALLBACK2-NEXT: orq %rbp, %r11
-; FALLBACK2-NEXT: addq %rax, %rax
-; FALLBACK2-NEXT: shlxq %r12, %rax, %rax
-; FALLBACK2-NEXT: orq %r13, %rax
-; FALLBACK2-NEXT: movq %rcx, 56(%rdx)
-; FALLBACK2-NEXT: movq %rax, 48(%rdx)
-; FALLBACK2-NEXT: movq %r11, 32(%rdx)
-; FALLBACK2-NEXT: movq %r8, 40(%rdx)
-; FALLBACK2-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK2-NEXT: movq %r10, 24(%rdx)
-; FALLBACK2-NEXT: movq %rdi, (%rdx)
-; FALLBACK2-NEXT: movq %r9, 8(%rdx)
-; FALLBACK2-NEXT: addq $8, %rsp
-; FALLBACK2-NEXT: popq %rbx
-; FALLBACK2-NEXT: popq %r12
-; FALLBACK2-NEXT: popq %r13
-; FALLBACK2-NEXT: popq %r14
-; FALLBACK2-NEXT: popq %r15
-; FALLBACK2-NEXT: popq %rbp
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: lshr_64bytes:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: pushq %r15
-; FALLBACK3-NEXT: pushq %r14
-; FALLBACK3-NEXT: pushq %rbx
-; FALLBACK3-NEXT: movq (%rdi), %rcx
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %r10
-; FALLBACK3-NEXT: movq 32(%rdi), %r11
-; FALLBACK3-NEXT: movq 40(%rdi), %rbx
-; FALLBACK3-NEXT: movq 48(%rdi), %r14
-; FALLBACK3-NEXT: movq 56(%rdi), %rdi
-; FALLBACK3-NEXT: movl (%rsi), %eax
-; FALLBACK3-NEXT: xorps %xmm0, %xmm0
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: leal (,%rax,8), %ecx
-; FALLBACK3-NEXT: andl $56, %ecx
-; FALLBACK3-NEXT: andl $56, %eax
-; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi
-; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi
-; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9
-; FALLBACK3-NEXT: movq %r9, %r8
-; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8
-; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10
-; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11
-; FALLBACK3-NEXT: movq %r11, %rbx
-; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx
-; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11
-; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14
-; FALLBACK3-NEXT: movq %r14, %r15
-; FALLBACK3-NEXT: shrdq %cl, %r11, %r15
-; FALLBACK3-NEXT: shrdq %cl, %r14, %r10
-; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK3-NEXT: shrdq %cl, %rax, %r11
-; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax
-; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi
-; FALLBACK3-NEXT: movq %r11, 48(%rdx)
-; FALLBACK3-NEXT: movq %r10, 32(%rdx)
-; FALLBACK3-NEXT: movq %r15, 40(%rdx)
-; FALLBACK3-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rbx, 24(%rdx)
-; FALLBACK3-NEXT: movq %rsi, (%rdx)
-; FALLBACK3-NEXT: movq %r8, 8(%rdx)
-; FALLBACK3-NEXT: movq %rax, 56(%rdx)
-; FALLBACK3-NEXT: popq %rbx
-; FALLBACK3-NEXT: popq %r14
-; FALLBACK3-NEXT: popq %r15
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: lshr_64bytes:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: pushq %rbp
-; FALLBACK4-NEXT: pushq %r15
-; FALLBACK4-NEXT: pushq %r14
-; FALLBACK4-NEXT: pushq %r13
-; FALLBACK4-NEXT: pushq %r12
-; FALLBACK4-NEXT: pushq %rbx
-; FALLBACK4-NEXT: pushq %rax
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK4-NEXT: movups 48(%rdi), %xmm3
-; FALLBACK4-NEXT: movl (%rsi), %r8d
-; FALLBACK4-NEXT: xorps %xmm4, %xmm4
-; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: leal (,%r8,8), %eax
-; FALLBACK4-NEXT: andl $56, %eax
-; FALLBACK4-NEXT: andl $56, %r8d
-; FALLBACK4-NEXT: movq -128(%rsp,%r8), %r10
-; FALLBACK4-NEXT: movq -120(%rsp,%r8), %r9
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: leaq (%r9,%r9), %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rdi
-; FALLBACK4-NEXT: orq %r10, %rdi
-; FALLBACK4-NEXT: movq -104(%rsp,%r8), %r10
-; FALLBACK4-NEXT: movq %r10, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %rbx
-; FALLBACK4-NEXT: movq -96(%rsp,%r8), %r12
-; FALLBACK4-NEXT: leaq (%r12,%r12), %r11
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r11
-; FALLBACK4-NEXT: orq %rbx, %r11
-; FALLBACK4-NEXT: movq -112(%rsp,%r8), %rbx
-; FALLBACK4-NEXT: movq %rbx, %r14
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r14
-; FALLBACK4-NEXT: addq %r10, %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: orq %r14, %r10
-; FALLBACK4-NEXT: movq -88(%rsp,%r8), %r14
-; FALLBACK4-NEXT: movq %r14, %r13
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r13
-; FALLBACK4-NEXT: movq -80(%rsp,%r8), %rbp
-; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r15
-; FALLBACK4-NEXT: orq %r13, %r15
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r12
-; FALLBACK4-NEXT: addq %r14, %r14
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r14
-; FALLBACK4-NEXT: orq %r12, %r14
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %rbp
-; FALLBACK4-NEXT: movq -72(%rsp,%r8), %r8
-; FALLBACK4-NEXT: leaq (%r8,%r8), %r12
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r12
-; FALLBACK4-NEXT: orq %rbp, %r12
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r9
-; FALLBACK4-NEXT: addq %rbx, %rbx
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rbx
-; FALLBACK4-NEXT: orq %r9, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r8
-; FALLBACK4-NEXT: movq %r8, 56(%rdx)
-; FALLBACK4-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK4-NEXT: movq %r12, 48(%rdx)
-; FALLBACK4-NEXT: movq %r14, 32(%rdx)
-; FALLBACK4-NEXT: movq %r15, 40(%rdx)
-; FALLBACK4-NEXT: movq %r10, 16(%rdx)
-; FALLBACK4-NEXT: movq %r11, 24(%rdx)
-; FALLBACK4-NEXT: movq %rdi, (%rdx)
-; FALLBACK4-NEXT: addq $8, %rsp
-; FALLBACK4-NEXT: popq %rbx
-; FALLBACK4-NEXT: popq %r12
-; FALLBACK4-NEXT: popq %r13
-; FALLBACK4-NEXT: popq %r14
-; FALLBACK4-NEXT: popq %r15
-; FALLBACK4-NEXT: popq %rbp
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: lshr_64bytes:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: pushq %r15
-; FALLBACK5-NEXT: pushq %r14
-; FALLBACK5-NEXT: pushq %rbx
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK5-NEXT: movups 48(%rdi), %xmm3
-; FALLBACK5-NEXT: movl (%rsi), %eax
-; FALLBACK5-NEXT: xorps %xmm4, %xmm4
-; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: leal (,%rax,8), %ecx
-; FALLBACK5-NEXT: andl $56, %ecx
-; FALLBACK5-NEXT: andl $56, %eax
-; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK5-NEXT: movq %r9, %rsi
-; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK5-NEXT: movq %r10, %r8
-; FALLBACK5-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK5-NEXT: movq %r11, %rbx
-; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK5-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK5-NEXT: movq %rax, %r15
-; FALLBACK5-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK5-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK5-NEXT: shrq %cl, %r11
-; FALLBACK5-NEXT: movq %r15, 8(%rdx)
-; FALLBACK5-NEXT: movq %r9, 48(%rdx)
-; FALLBACK5-NEXT: movq %r11, 56(%rdx)
-; FALLBACK5-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK5-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK5-NEXT: movq %r8, 16(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r14, (%rdx)
-; FALLBACK5-NEXT: popq %rbx
-; FALLBACK5-NEXT: popq %r14
-; FALLBACK5-NEXT: popq %r15
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: lshr_64bytes:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: pushq %rbp
-; FALLBACK6-NEXT: pushq %r15
-; FALLBACK6-NEXT: pushq %r14
-; FALLBACK6-NEXT: pushq %r13
-; FALLBACK6-NEXT: pushq %r12
-; FALLBACK6-NEXT: pushq %rbx
-; FALLBACK6-NEXT: pushq %rax
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK6-NEXT: movups 48(%rdi), %xmm3
-; FALLBACK6-NEXT: movl (%rsi), %eax
-; FALLBACK6-NEXT: xorps %xmm4, %xmm4
-; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: leal (,%rax,8), %esi
-; FALLBACK6-NEXT: andl $56, %esi
-; FALLBACK6-NEXT: andl $56, %eax
-; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx
-; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi
-; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12
-; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13
-; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9
-; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10
-; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14
-; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15
-; FALLBACK6-NEXT: movl %esi, %ebx
-; FALLBACK6-NEXT: notb %bl
-; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp
-; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8
-; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8
-; FALLBACK6-NEXT: orq %r11, %r8
-; FALLBACK6-NEXT: leaq (%r13,%r13), %r11
-; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11
-; FALLBACK6-NEXT: orq %r12, %r11
-; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12
-; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13
-; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp
-; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi
-; FALLBACK6-NEXT: addq %rdi, %rdi
-; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi
-; FALLBACK6-NEXT: orq %r9, %rdi
-; FALLBACK6-NEXT: leaq (%r12,%r12), %r9
-; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9
-; FALLBACK6-NEXT: orq %r14, %r9
-; FALLBACK6-NEXT: addq %r10, %r10
-; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10
-; FALLBACK6-NEXT: orq %r15, %r10
-; FALLBACK6-NEXT: addq %rax, %rax
-; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax
-; FALLBACK6-NEXT: orq %r13, %rax
-; FALLBACK6-NEXT: addq %rcx, %rcx
-; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx
-; FALLBACK6-NEXT: orq %rbp, %rcx
-; FALLBACK6-NEXT: movq %rsi, 56(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 8(%rdx)
-; FALLBACK6-NEXT: movq %rax, 48(%rdx)
-; FALLBACK6-NEXT: movq %r10, 32(%rdx)
-; FALLBACK6-NEXT: movq %r9, 40(%rdx)
-; FALLBACK6-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK6-NEXT: movq %r11, 24(%rdx)
-; FALLBACK6-NEXT: movq %r8, (%rdx)
-; FALLBACK6-NEXT: addq $8, %rsp
-; FALLBACK6-NEXT: popq %rbx
-; FALLBACK6-NEXT: popq %r12
-; FALLBACK6-NEXT: popq %r13
-; FALLBACK6-NEXT: popq %r14
-; FALLBACK6-NEXT: popq %r15
-; FALLBACK6-NEXT: popq %rbp
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: lshr_64bytes:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: pushq %r15
-; FALLBACK7-NEXT: pushq %r14
-; FALLBACK7-NEXT: pushq %rbx
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK7-NEXT: movups 48(%rdi), %xmm3
-; FALLBACK7-NEXT: movl (%rsi), %eax
-; FALLBACK7-NEXT: xorps %xmm4, %xmm4
-; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: leal (,%rax,8), %ecx
-; FALLBACK7-NEXT: andl $56, %ecx
-; FALLBACK7-NEXT: andl $56, %eax
-; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK7-NEXT: movq %r9, %rsi
-; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK7-NEXT: movq %r10, %r8
-; FALLBACK7-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK7-NEXT: movq %r11, %rbx
-; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK7-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK7-NEXT: movq %rax, %r15
-; FALLBACK7-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK7-NEXT: shrxq %rcx, %r11, %r10
-; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK7-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK7-NEXT: movq %r15, 8(%rdx)
-; FALLBACK7-NEXT: movq %r9, 48(%rdx)
-; FALLBACK7-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK7-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK7-NEXT: movq %r8, 16(%rdx)
-; FALLBACK7-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK7-NEXT: movq %r14, (%rdx)
-; FALLBACK7-NEXT: movq %r10, 56(%rdx)
-; FALLBACK7-NEXT: popq %rbx
-; FALLBACK7-NEXT: popq %r14
-; FALLBACK7-NEXT: popq %r15
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: lshr_64bytes:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: pushq %rbp
-; FALLBACK8-NEXT: pushq %r15
-; FALLBACK8-NEXT: pushq %r14
-; FALLBACK8-NEXT: pushq %r13
-; FALLBACK8-NEXT: pushq %r12
-; FALLBACK8-NEXT: pushq %rbx
-; FALLBACK8-NEXT: pushq %rax
-; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1
-; FALLBACK8-NEXT: movl (%rsi), %r9d
-; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: leal (,%r9,8), %eax
-; FALLBACK8-NEXT: andl $56, %eax
-; FALLBACK8-NEXT: andl $56, %r9d
-; FALLBACK8-NEXT: movq -128(%rsp,%r9), %r10
-; FALLBACK8-NEXT: movq -120(%rsp,%r9), %r8
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rdi
-; FALLBACK8-NEXT: orq %r10, %rdi
-; FALLBACK8-NEXT: movq -104(%rsp,%r9), %r10
-; FALLBACK8-NEXT: movq %r10, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %rbx
-; FALLBACK8-NEXT: movq -96(%rsp,%r9), %r12
-; FALLBACK8-NEXT: leaq (%r12,%r12), %r11
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r11
-; FALLBACK8-NEXT: orq %rbx, %r11
-; FALLBACK8-NEXT: movq -112(%rsp,%r9), %rbx
-; FALLBACK8-NEXT: movq %rbx, %r14
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r14
-; FALLBACK8-NEXT: addq %r10, %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: orq %r14, %r10
-; FALLBACK8-NEXT: movq -88(%rsp,%r9), %r14
-; FALLBACK8-NEXT: movq %r14, %r13
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r13
-; FALLBACK8-NEXT: movq -80(%rsp,%r9), %rbp
-; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r15
-; FALLBACK8-NEXT: orq %r13, %r15
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r12
-; FALLBACK8-NEXT: addq %r14, %r14
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r14
-; FALLBACK8-NEXT: orq %r12, %r14
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %rbp
-; FALLBACK8-NEXT: movq -72(%rsp,%r9), %r9
-; FALLBACK8-NEXT: leaq (%r9,%r9), %r12
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r12
-; FALLBACK8-NEXT: orq %rbp, %r12
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r8
-; FALLBACK8-NEXT: addq %rbx, %rbx
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rbx
-; FALLBACK8-NEXT: orq %r8, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r9
-; FALLBACK8-NEXT: movq %r9, 56(%rdx)
-; FALLBACK8-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK8-NEXT: movq %r12, 48(%rdx)
-; FALLBACK8-NEXT: movq %r14, 32(%rdx)
-; FALLBACK8-NEXT: movq %r15, 40(%rdx)
-; FALLBACK8-NEXT: movq %r10, 16(%rdx)
-; FALLBACK8-NEXT: movq %r11, 24(%rdx)
-; FALLBACK8-NEXT: movq %rdi, (%rdx)
-; FALLBACK8-NEXT: addq $8, %rsp
-; FALLBACK8-NEXT: popq %rbx
-; FALLBACK8-NEXT: popq %r12
-; FALLBACK8-NEXT: popq %r13
-; FALLBACK8-NEXT: popq %r14
-; FALLBACK8-NEXT: popq %r15
-; FALLBACK8-NEXT: popq %rbp
-; FALLBACK8-NEXT: vzeroupper
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: lshr_64bytes:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: pushq %r15
-; FALLBACK9-NEXT: pushq %r14
-; FALLBACK9-NEXT: pushq %rbx
-; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1
-; FALLBACK9-NEXT: movl (%rsi), %eax
-; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: leal (,%rax,8), %ecx
-; FALLBACK9-NEXT: andl $56, %ecx
-; FALLBACK9-NEXT: andl $56, %eax
-; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK9-NEXT: movq %r9, %rsi
-; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK9-NEXT: movq %r10, %r8
-; FALLBACK9-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK9-NEXT: movq %r11, %rbx
-; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK9-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK9-NEXT: movq %rax, %r15
-; FALLBACK9-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK9-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK9-NEXT: shrq %cl, %r11
-; FALLBACK9-NEXT: movq %r15, 8(%rdx)
-; FALLBACK9-NEXT: movq %r9, 48(%rdx)
-; FALLBACK9-NEXT: movq %r11, 56(%rdx)
-; FALLBACK9-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK9-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK9-NEXT: movq %r8, 16(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r14, (%rdx)
-; FALLBACK9-NEXT: popq %rbx
-; FALLBACK9-NEXT: popq %r14
-; FALLBACK9-NEXT: popq %r15
-; FALLBACK9-NEXT: vzeroupper
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: lshr_64bytes:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: pushq %rbp
-; FALLBACK10-NEXT: pushq %r15
-; FALLBACK10-NEXT: pushq %r14
-; FALLBACK10-NEXT: pushq %r13
-; FALLBACK10-NEXT: pushq %r12
-; FALLBACK10-NEXT: pushq %rbx
-; FALLBACK10-NEXT: pushq %rax
-; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1
-; FALLBACK10-NEXT: movl (%rsi), %eax
-; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: leal (,%rax,8), %esi
-; FALLBACK10-NEXT: andl $56, %esi
-; FALLBACK10-NEXT: andl $56, %eax
-; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx
-; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi
-; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12
-; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13
-; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9
-; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10
-; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14
-; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15
-; FALLBACK10-NEXT: movl %esi, %ebx
-; FALLBACK10-NEXT: notb %bl
-; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp
-; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8
-; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8
-; FALLBACK10-NEXT: orq %r11, %r8
-; FALLBACK10-NEXT: leaq (%r13,%r13), %r11
-; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11
-; FALLBACK10-NEXT: orq %r12, %r11
-; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12
-; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13
-; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp
-; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi
-; FALLBACK10-NEXT: addq %rdi, %rdi
-; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi
-; FALLBACK10-NEXT: orq %r9, %rdi
-; FALLBACK10-NEXT: leaq (%r12,%r12), %r9
-; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9
-; FALLBACK10-NEXT: orq %r14, %r9
-; FALLBACK10-NEXT: addq %r10, %r10
-; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10
-; FALLBACK10-NEXT: orq %r15, %r10
-; FALLBACK10-NEXT: addq %rax, %rax
-; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax
-; FALLBACK10-NEXT: orq %r13, %rax
-; FALLBACK10-NEXT: addq %rcx, %rcx
-; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx
-; FALLBACK10-NEXT: orq %rbp, %rcx
-; FALLBACK10-NEXT: movq %rsi, 56(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 8(%rdx)
-; FALLBACK10-NEXT: movq %rax, 48(%rdx)
-; FALLBACK10-NEXT: movq %r10, 32(%rdx)
-; FALLBACK10-NEXT: movq %r9, 40(%rdx)
-; FALLBACK10-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK10-NEXT: movq %r11, 24(%rdx)
-; FALLBACK10-NEXT: movq %r8, (%rdx)
-; FALLBACK10-NEXT: addq $8, %rsp
-; FALLBACK10-NEXT: popq %rbx
-; FALLBACK10-NEXT: popq %r12
-; FALLBACK10-NEXT: popq %r13
-; FALLBACK10-NEXT: popq %r14
-; FALLBACK10-NEXT: popq %r15
-; FALLBACK10-NEXT: popq %rbp
-; FALLBACK10-NEXT: vzeroupper
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: lshr_64bytes:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: pushq %r15
-; FALLBACK11-NEXT: pushq %r14
-; FALLBACK11-NEXT: pushq %rbx
-; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1
-; FALLBACK11-NEXT: movl (%rsi), %eax
-; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: leal (,%rax,8), %ecx
-; FALLBACK11-NEXT: andl $56, %ecx
-; FALLBACK11-NEXT: andl $56, %eax
-; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK11-NEXT: movq %r9, %rsi
-; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK11-NEXT: movq %r10, %r8
-; FALLBACK11-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK11-NEXT: movq %r11, %rbx
-; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK11-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK11-NEXT: movq %rax, %r15
-; FALLBACK11-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK11-NEXT: shrxq %rcx, %r11, %r10
-; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK11-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK11-NEXT: movq %r15, 8(%rdx)
-; FALLBACK11-NEXT: movq %r9, 48(%rdx)
-; FALLBACK11-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK11-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK11-NEXT: movq %r8, 16(%rdx)
-; FALLBACK11-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK11-NEXT: movq %r14, (%rdx)
-; FALLBACK11-NEXT: movq %r10, 56(%rdx)
-; FALLBACK11-NEXT: popq %rbx
-; FALLBACK11-NEXT: popq %r14
-; FALLBACK11-NEXT: popq %r15
-; FALLBACK11-NEXT: vzeroupper
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: lshr_64bytes:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: pushq %rbp
-; FALLBACK12-NEXT: pushq %r15
-; FALLBACK12-NEXT: pushq %r14
-; FALLBACK12-NEXT: pushq %r13
-; FALLBACK12-NEXT: pushq %r12
-; FALLBACK12-NEXT: pushq %rbx
-; FALLBACK12-NEXT: pushq %rax
-; FALLBACK12-NEXT: vmovups (%rdi), %zmm0
-; FALLBACK12-NEXT: movl (%rsi), %r9d
-; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: leal (,%r9,8), %eax
-; FALLBACK12-NEXT: andl $56, %eax
-; FALLBACK12-NEXT: andl $56, %r9d
-; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10
-; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rdi
-; FALLBACK12-NEXT: orq %r10, %rdi
-; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10
-; FALLBACK12-NEXT: movq %r10, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %rbx
-; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12
-; FALLBACK12-NEXT: leaq (%r12,%r12), %r11
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r11
-; FALLBACK12-NEXT: orq %rbx, %r11
-; FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx
-; FALLBACK12-NEXT: movq %rbx, %r14
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r14
-; FALLBACK12-NEXT: addq %r10, %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: orq %r14, %r10
-; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14
-; FALLBACK12-NEXT: movq %r14, %r13
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r13
-; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp
-; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r15
-; FALLBACK12-NEXT: orq %r13, %r15
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r12
-; FALLBACK12-NEXT: addq %r14, %r14
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r14
-; FALLBACK12-NEXT: orq %r12, %r14
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %rbp
-; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9
-; FALLBACK12-NEXT: leaq (%r9,%r9), %r12
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r12
-; FALLBACK12-NEXT: orq %rbp, %r12
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r8
-; FALLBACK12-NEXT: addq %rbx, %rbx
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rbx
-; FALLBACK12-NEXT: orq %r8, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r9
-; FALLBACK12-NEXT: movq %r9, 56(%rdx)
-; FALLBACK12-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK12-NEXT: movq %r12, 48(%rdx)
-; FALLBACK12-NEXT: movq %r14, 32(%rdx)
-; FALLBACK12-NEXT: movq %r15, 40(%rdx)
-; FALLBACK12-NEXT: movq %r10, 16(%rdx)
-; FALLBACK12-NEXT: movq %r11, 24(%rdx)
-; FALLBACK12-NEXT: movq %rdi, (%rdx)
-; FALLBACK12-NEXT: addq $8, %rsp
-; FALLBACK12-NEXT: popq %rbx
-; FALLBACK12-NEXT: popq %r12
-; FALLBACK12-NEXT: popq %r13
-; FALLBACK12-NEXT: popq %r14
-; FALLBACK12-NEXT: popq %r15
-; FALLBACK12-NEXT: popq %rbp
-; FALLBACK12-NEXT: vzeroupper
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: lshr_64bytes:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: pushq %r15
-; FALLBACK13-NEXT: pushq %r14
-; FALLBACK13-NEXT: pushq %rbx
-; FALLBACK13-NEXT: vmovups (%rdi), %zmm0
-; FALLBACK13-NEXT: movl (%rsi), %edi
-; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: leal (,%rdi,8), %ecx
-; FALLBACK13-NEXT: andl $56, %ecx
-; FALLBACK13-NEXT: andl $56, %edi
-; FALLBACK13-NEXT: movq -96(%rsp,%rdi), %rsi
-; FALLBACK13-NEXT: movq -104(%rsp,%rdi), %r9
-; FALLBACK13-NEXT: movq %r9, %rax
-; FALLBACK13-NEXT: shrdq %cl, %rsi, %rax
-; FALLBACK13-NEXT: movq -112(%rsp,%rdi), %r10
-; FALLBACK13-NEXT: movq %r10, %r8
-; FALLBACK13-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK13-NEXT: movq -80(%rsp,%rdi), %r9
-; FALLBACK13-NEXT: movq -88(%rsp,%rdi), %r11
-; FALLBACK13-NEXT: movq %r11, %rbx
-; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK13-NEXT: shrdq %cl, %r11, %rsi
-; FALLBACK13-NEXT: movq -72(%rsp,%rdi), %r11
-; FALLBACK13-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK13-NEXT: movq -128(%rsp,%rdi), %r14
-; FALLBACK13-NEXT: movq -120(%rsp,%rdi), %rdi
-; FALLBACK13-NEXT: movq %rdi, %r15
-; FALLBACK13-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK13-NEXT: shrdq %cl, %rdi, %r14
-; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK13-NEXT: shrq %cl, %r11
-; FALLBACK13-NEXT: movq %r15, 8(%rdx)
-; FALLBACK13-NEXT: movq %r9, 48(%rdx)
-; FALLBACK13-NEXT: movq %r11, 56(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 32(%rdx)
-; FALLBACK13-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK13-NEXT: movq %r8, 16(%rdx)
-; FALLBACK13-NEXT: movq %rax, 24(%rdx)
-; FALLBACK13-NEXT: movq %r14, (%rdx)
-; FALLBACK13-NEXT: popq %rbx
-; FALLBACK13-NEXT: popq %r14
-; FALLBACK13-NEXT: popq %r15
-; FALLBACK13-NEXT: vzeroupper
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: lshr_64bytes:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: pushq %rbp
-; FALLBACK14-NEXT: pushq %r15
-; FALLBACK14-NEXT: pushq %r14
-; FALLBACK14-NEXT: pushq %r13
-; FALLBACK14-NEXT: pushq %r12
-; FALLBACK14-NEXT: pushq %rbx
-; FALLBACK14-NEXT: pushq %rax
-; FALLBACK14-NEXT: vmovups (%rdi), %zmm0
-; FALLBACK14-NEXT: movl (%rsi), %esi
-; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK14-NEXT: andl $56, %ecx
-; FALLBACK14-NEXT: andl $56, %esi
-; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11
-; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax
-; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi
-; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12
-; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13
-; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9
-; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10
-; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14
-; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15
-; FALLBACK14-NEXT: movl %ecx, %ebx
-; FALLBACK14-NEXT: notb %bl
-; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp
-; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8
-; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8
-; FALLBACK14-NEXT: orq %r11, %r8
-; FALLBACK14-NEXT: leaq (%r13,%r13), %r11
-; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11
-; FALLBACK14-NEXT: orq %r12, %r11
-; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12
-; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13
-; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp
-; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi
-; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx
-; FALLBACK14-NEXT: addq %rdi, %rdi
-; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi
-; FALLBACK14-NEXT: orq %r9, %rdi
-; FALLBACK14-NEXT: leaq (%r12,%r12), %r9
-; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9
-; FALLBACK14-NEXT: orq %r14, %r9
-; FALLBACK14-NEXT: addq %r10, %r10
-; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10
-; FALLBACK14-NEXT: orq %r15, %r10
-; FALLBACK14-NEXT: addq %rsi, %rsi
-; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi
-; FALLBACK14-NEXT: orq %r13, %rsi
-; FALLBACK14-NEXT: addq %rax, %rax
-; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax
-; FALLBACK14-NEXT: orq %rbp, %rax
-; FALLBACK14-NEXT: movq %rcx, 56(%rdx)
-; FALLBACK14-NEXT: movq %rax, 8(%rdx)
-; FALLBACK14-NEXT: movq %rsi, 48(%rdx)
-; FALLBACK14-NEXT: movq %r10, 32(%rdx)
-; FALLBACK14-NEXT: movq %r9, 40(%rdx)
-; FALLBACK14-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK14-NEXT: movq %r11, 24(%rdx)
-; FALLBACK14-NEXT: movq %r8, (%rdx)
-; FALLBACK14-NEXT: addq $8, %rsp
-; FALLBACK14-NEXT: popq %rbx
-; FALLBACK14-NEXT: popq %r12
-; FALLBACK14-NEXT: popq %r13
-; FALLBACK14-NEXT: popq %r14
-; FALLBACK14-NEXT: popq %r15
-; FALLBACK14-NEXT: popq %rbp
-; FALLBACK14-NEXT: vzeroupper
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: lshr_64bytes:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: pushq %r15
-; FALLBACK15-NEXT: pushq %r14
-; FALLBACK15-NEXT: pushq %rbx
-; FALLBACK15-NEXT: vmovups (%rdi), %zmm0
-; FALLBACK15-NEXT: movl (%rsi), %eax
-; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: leal (,%rax,8), %ecx
-; FALLBACK15-NEXT: andl $56, %ecx
-; FALLBACK15-NEXT: andl $56, %eax
-; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK15-NEXT: movq %r9, %rsi
-; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK15-NEXT: movq %r10, %r8
-; FALLBACK15-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK15-NEXT: movq %r11, %rbx
-; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK15-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK15-NEXT: movq %rax, %r15
-; FALLBACK15-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10
-; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK15-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK15-NEXT: movq %r15, 8(%rdx)
-; FALLBACK15-NEXT: movq %r9, 48(%rdx)
-; FALLBACK15-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK15-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK15-NEXT: movq %r8, 16(%rdx)
-; FALLBACK15-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK15-NEXT: movq %r14, (%rdx)
-; FALLBACK15-NEXT: movq %r10, 56(%rdx)
-; FALLBACK15-NEXT: popq %rbx
-; FALLBACK15-NEXT: popq %r14
-; FALLBACK15-NEXT: popq %r15
-; FALLBACK15-NEXT: vzeroupper
-; FALLBACK15-NEXT: retq
-;
-; FALLBACK16-LABEL: lshr_64bytes:
-; FALLBACK16: # %bb.0:
-; FALLBACK16-NEXT: pushl %ebp
-; FALLBACK16-NEXT: pushl %ebx
-; FALLBACK16-NEXT: pushl %edi
-; FALLBACK16-NEXT: pushl %esi
-; FALLBACK16-NEXT: subl $204, %esp
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl (%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 4(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 8(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 12(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 16(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 20(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 24(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 28(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 32(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 36(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 40(%eax), %ebp
-; FALLBACK16-NEXT: movl 44(%eax), %ebx
-; FALLBACK16-NEXT: movl 48(%eax), %edi
-; FALLBACK16-NEXT: movl 52(%eax), %esi
-; FALLBACK16-NEXT: movl 56(%eax), %edx
-; FALLBACK16-NEXT: movl 60(%eax), %ecx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl (%eax), %eax
-; FALLBACK16-NEXT: xorps %xmm0, %xmm0
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %eax, %esi
-; FALLBACK16-NEXT: andl $60, %esi
-; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK16-NEXT: shll $3, %eax
-; FALLBACK16-NEXT: andl $24, %eax
-; FALLBACK16-NEXT: movl %edx, %edi
-; FALLBACK16-NEXT: movl %eax, %ecx
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: movl 72(%esp,%esi), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK16-NEXT: movb %al, %ch
-; FALLBACK16-NEXT: notb %ch
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %edi, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 64(%esp,%esi), %edi
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: addl %edx, %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: orl %edi, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 76(%esp,%esi), %edx
-; FALLBACK16-NEXT: movl %edx, %ebp
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK16-NEXT: leal (%edi,%edi), %ebx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %ebp, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: addl %edx, %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: orl %ebx, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK16-NEXT: movl %ebx, %ebp
-; FALLBACK16-NEXT: movl %eax, %edx
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: movl 88(%esp,%esi), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: addl %eax, %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %ebp, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: addl %ebx, %ebx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %edi, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 92(%esp,%esi), %ebx
-; FALLBACK16-NEXT: movl %ebx, %ebp
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: movl 96(%esp,%esi), %edi
-; FALLBACK16-NEXT: leal (%edi,%edi), %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %ebp, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: addl %ebx, %ebx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %eax, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 100(%esp,%esi), %ebx
-; FALLBACK16-NEXT: movl %ebx, %ebp
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: movl 104(%esp,%esi), %edx
-; FALLBACK16-NEXT: leal (%edx,%edx), %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %ebp, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: addl %ebx, %ebx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %edi, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 108(%esp,%esi), %edi
-; FALLBACK16-NEXT: movl %edi, %ebp
-; FALLBACK16-NEXT: movl %eax, %ecx
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: movl 112(%esp,%esi), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %ebp, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: addl %edi, %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: orl %edx, %edi
-; FALLBACK16-NEXT: movl %esi, %edx
-; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 116(%esp,%esi), %esi
-; FALLBACK16-NEXT: movl %esi, %ebx
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: movl 120(%esp,%edx), %eax
-; FALLBACK16-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %ebx, %ebp
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: addl %esi, %esi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: orl %ebx, %esi
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: orl %eax, %edx
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK16-NEXT: shrl %cl, %ebx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl %ebx, 60(%eax)
-; FALLBACK16-NEXT: movl %edx, 56(%eax)
-; FALLBACK16-NEXT: movl %esi, 48(%eax)
-; FALLBACK16-NEXT: movl %ebp, 52(%eax)
-; FALLBACK16-NEXT: movl %edi, 40(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 44(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 32(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 36(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 24(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 28(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 16(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 20(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 8(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 12(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, (%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 4(%eax)
-; FALLBACK16-NEXT: addl $204, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: lshr_64bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebp
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $188, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl (%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 8(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 12(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 16(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 20(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 24(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 28(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 32(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 36(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 40(%ecx), %ebp
-; FALLBACK17-NEXT: movl 44(%ecx), %ebx
-; FALLBACK17-NEXT: movl 48(%ecx), %edi
-; FALLBACK17-NEXT: movl 52(%ecx), %esi
-; FALLBACK17-NEXT: movl 56(%ecx), %edx
-; FALLBACK17-NEXT: movl 60(%ecx), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl (%ecx), %ecx
-; FALLBACK17-NEXT: xorps %xmm0, %xmm0
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ecx, %ebp
-; FALLBACK17-NEXT: andl $60, %ebp
-; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shll $3, %ecx
-; FALLBACK17-NEXT: andl $24, %ecx
-; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %esi
-; FALLBACK17-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edx
-; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edx
-; FALLBACK17-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edx
-; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl %esi, %edx
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edi
-; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edi
-; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK17-NEXT: movl %edx, 56(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK17-NEXT: shrl %cl, %eax
-; FALLBACK17-NEXT: movl %eax, 60(%ebp)
-; FALLBACK17-NEXT: movl %esi, 48(%ebp)
-; FALLBACK17-NEXT: movl %edi, 52(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 40(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 44(%ebp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 32(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 36(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 24(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 28(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 16(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 20(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 8(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 12(%ebp)
-; FALLBACK17-NEXT: movl %ebx, (%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 4(%ebp)
-; FALLBACK17-NEXT: addl $188, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: popl %ebp
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: lshr_64bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $204, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl (%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 12(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 16(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 20(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 24(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 28(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 32(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 36(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 40(%eax), %ebp
-; FALLBACK18-NEXT: movl 44(%eax), %ebx
-; FALLBACK18-NEXT: movl 48(%eax), %edi
-; FALLBACK18-NEXT: movl 52(%eax), %esi
-; FALLBACK18-NEXT: movl 56(%eax), %edx
-; FALLBACK18-NEXT: movl 60(%eax), %ecx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl (%eax), %eax
-; FALLBACK18-NEXT: xorps %xmm0, %xmm0
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %eax, %ecx
-; FALLBACK18-NEXT: leal (,%eax,8), %edx
-; FALLBACK18-NEXT: andl $24, %edx
-; FALLBACK18-NEXT: andl $60, %ecx
-; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %edi
-; FALLBACK18-NEXT: movl %edx, %ebx
-; FALLBACK18-NEXT: notb %bl
-; FALLBACK18-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax
-; FALLBACK18-NEXT: orl %edi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK18-NEXT: orl %edi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK18-NEXT: leal (%esi,%esi), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK18-NEXT: orl %eax, %edi
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: orl %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK18-NEXT: leal (%esi,%esi), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK18-NEXT: orl %eax, %edi
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: orl %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi
-; FALLBACK18-NEXT: movl %ecx, %edi
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK18-NEXT: orl %ecx, %esi
-; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp
-; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax
-; FALLBACK18-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK18-NEXT: orl %edi, %ecx
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: addl %eax, %eax
-; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp
-; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx
-; FALLBACK18-NEXT: addl %ebp, %ebp
-; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx
-; FALLBACK18-NEXT: orl %eax, %ebx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl %edx, 60(%eax)
-; FALLBACK18-NEXT: movl %ebx, 56(%eax)
-; FALLBACK18-NEXT: movl %edi, 48(%eax)
-; FALLBACK18-NEXT: movl %ecx, 52(%eax)
-; FALLBACK18-NEXT: movl %esi, 40(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 44(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 32(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 36(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 24(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 28(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 16(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 20(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 8(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 12(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, (%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 4(%eax)
-; FALLBACK18-NEXT: addl $204, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: lshr_64bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $188, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl (%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 4(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 12(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 16(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 20(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 24(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 28(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 32(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 36(%ecx), %eax
-; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 40(%ecx), %ebp
-; FALLBACK19-NEXT: movl 44(%ecx), %ebx
-; FALLBACK19-NEXT: movl 48(%ecx), %edi
-; FALLBACK19-NEXT: movl 52(%ecx), %esi
-; FALLBACK19-NEXT: movl 56(%ecx), %edx
-; FALLBACK19-NEXT: movl 60(%ecx), %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl (%ecx), %ecx
-; FALLBACK19-NEXT: xorps %xmm0, %xmm0
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ecx, %ebp
-; FALLBACK19-NEXT: andl $60, %ebp
-; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shll $3, %ecx
-; FALLBACK19-NEXT: andl $24, %ecx
-; FALLBACK19-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %esi
-; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl %edi, %edx
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl %eax, 56(%ebp)
-; FALLBACK19-NEXT: movl %esi, 48(%ebp)
-; FALLBACK19-NEXT: movl %edx, 52(%ebp)
-; FALLBACK19-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 44(%ebp)
-; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 32(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 36(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 24(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 28(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 16(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 20(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 8(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 12(%ebp)
-; FALLBACK19-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl %edi, (%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK19-NEXT: movl %eax, 60(%ebp)
-; FALLBACK19-NEXT: addl $188, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: lshr_64bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $204, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK20-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK20-NEXT: movl (%eax), %eax
-; FALLBACK20-NEXT: xorps %xmm4, %xmm4
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %eax, %esi
-; FALLBACK20-NEXT: andl $60, %esi
-; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK20-NEXT: shll $3, %eax
-; FALLBACK20-NEXT: andl $24, %eax
-; FALLBACK20-NEXT: movl %edx, %edi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT: movb %al, %ch
-; FALLBACK20-NEXT: notb %ch
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %edi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %edx, %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: orl %edi, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx
-; FALLBACK20-NEXT: movl %edx, %ebp
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK20-NEXT: leal (%edi,%edi), %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %ebp, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: addl %edx, %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: orl %ebx, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: movl %eax, %edx
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: addl %eax, %eax
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: orl %ebp, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %edi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi
-; FALLBACK20-NEXT: leal (%edi,%edi), %eax
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: orl %ebp, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %eax, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx
-; FALLBACK20-NEXT: leal (%edx,%edx), %eax
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: orl %ebp, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %edi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi
-; FALLBACK20-NEXT: movl %edi, %ebp
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %ebp, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: addl %edi, %edi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: orl %edx, %edi
-; FALLBACK20-NEXT: movl %esi, %edx
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi
-; FALLBACK20-NEXT: movl %esi, %ebx
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax
-; FALLBACK20-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: orl %ebx, %ebp
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: addl %esi, %esi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: orl %ebx, %esi
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: orl %eax, %edx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl %ebx, 60(%eax)
-; FALLBACK20-NEXT: movl %edx, 56(%eax)
-; FALLBACK20-NEXT: movl %esi, 48(%eax)
-; FALLBACK20-NEXT: movl %ebp, 52(%eax)
-; FALLBACK20-NEXT: movl %edi, 40(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 44(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 32(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 36(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 24(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 28(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 16(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 20(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 8(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 12(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, (%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 4(%eax)
-; FALLBACK20-NEXT: addl $204, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: lshr_64bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $188, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movups (%ecx), %xmm0
-; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK21-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK21-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK21-NEXT: movl (%eax), %ecx
-; FALLBACK21-NEXT: xorps %xmm4, %xmm4
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %ecx, %ebp
-; FALLBACK21-NEXT: andl $60, %ebp
-; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shll $3, %ecx
-; FALLBACK21-NEXT: andl $24, %ecx
-; FALLBACK21-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %esi
-; FALLBACK21-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl %esi, %edx
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edi
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edi
-; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK21-NEXT: movl %edx, 56(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK21-NEXT: shrl %cl, %eax
-; FALLBACK21-NEXT: movl %eax, 60(%ebp)
-; FALLBACK21-NEXT: movl %esi, 48(%ebp)
-; FALLBACK21-NEXT: movl %edi, 52(%ebp)
-; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 40(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 44(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 32(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 36(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 24(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 28(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 16(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 20(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 8(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 12(%ebp)
-; FALLBACK21-NEXT: movl %ebx, (%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 4(%ebp)
-; FALLBACK21-NEXT: addl $188, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: lshr_64bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $204, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK22-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK22-NEXT: movl (%eax), %ecx
-; FALLBACK22-NEXT: xorps %xmm4, %xmm4
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: leal (,%ecx,8), %edx
-; FALLBACK22-NEXT: andl $24, %edx
-; FALLBACK22-NEXT: andl $60, %ecx
-; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %edi
-; FALLBACK22-NEXT: movl %edx, %ebx
-; FALLBACK22-NEXT: notb %bl
-; FALLBACK22-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp
-; FALLBACK22-NEXT: orl %edi, %ebp
-; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %edi, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK22-NEXT: leal (%esi,%esi), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK22-NEXT: orl %eax, %edi
-; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: leal (%eax,%eax), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: orl %esi, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK22-NEXT: leal (%esi,%esi), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK22-NEXT: orl %eax, %edi
-; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: leal (%eax,%eax), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: orl %esi, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl %ecx, %eax
-; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi
-; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx
-; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %ecx, %esi
-; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp
-; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax
-; FALLBACK22-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK22-NEXT: orl %edi, %ecx
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %eax, %eax
-; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp
-; FALLBACK22-NEXT: shrxl %edx, %ebp, %edx
-; FALLBACK22-NEXT: addl %ebp, %ebp
-; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx
-; FALLBACK22-NEXT: orl %eax, %ebx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl %edx, 60(%eax)
-; FALLBACK22-NEXT: movl %ebx, 56(%eax)
-; FALLBACK22-NEXT: movl %edi, 48(%eax)
-; FALLBACK22-NEXT: movl %ecx, 52(%eax)
-; FALLBACK22-NEXT: movl %esi, 40(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 44(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 32(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 36(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 24(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 28(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 16(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 20(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 8(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 12(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, (%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 4(%eax)
-; FALLBACK22-NEXT: addl $204, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: lshr_64bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $188, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movups (%ecx), %xmm0
-; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK23-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK23-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK23-NEXT: movl (%eax), %ecx
-; FALLBACK23-NEXT: xorps %xmm4, %xmm4
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %ecx, %ebp
-; FALLBACK23-NEXT: andl $60, %ebp
-; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shll $3, %ecx
-; FALLBACK23-NEXT: andl $24, %ecx
-; FALLBACK23-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %esi
-; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl %edi, %edx
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK23-NEXT: movl %eax, 56(%ebp)
-; FALLBACK23-NEXT: movl %esi, 48(%ebp)
-; FALLBACK23-NEXT: movl %edx, 52(%ebp)
-; FALLBACK23-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 44(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 32(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 36(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 24(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 28(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 16(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 20(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 8(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 12(%ebp)
-; FALLBACK23-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK23-NEXT: movl %edi, (%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK23-NEXT: movl %eax, 60(%ebp)
-; FALLBACK23-NEXT: addl $188, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: lshr_64bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $204, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK24-NEXT: movl (%eax), %ecx
-; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, %esi
-; FALLBACK24-NEXT: andl $60, %esi
-; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK24-NEXT: shll $3, %ecx
-; FALLBACK24-NEXT: andl $24, %ecx
-; FALLBACK24-NEXT: movl %edx, %edi
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: movl 72(%esp,%esi), %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: leal (%eax,%eax), %ebx
-; FALLBACK24-NEXT: movl %ecx, %ebp
-; FALLBACK24-NEXT: movb %cl, %ch
-; FALLBACK24-NEXT: notb %ch
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %edi, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi
-; FALLBACK24-NEXT: movl %ebp, %eax
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: addl %edx, %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: orl %edi, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx
-; FALLBACK24-NEXT: movl %edx, %ebp
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK24-NEXT: leal (%edi,%edi), %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %ebp, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: addl %edx, %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: orl %ebx, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %ebp
-; FALLBACK24-NEXT: movl %eax, %edx
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: addl %eax, %eax
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %eax
-; FALLBACK24-NEXT: orl %ebp, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %edi, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %ebp
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi
-; FALLBACK24-NEXT: leal (%edi,%edi), %eax
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %eax
-; FALLBACK24-NEXT: orl %ebp, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %eax, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %ebp
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx
-; FALLBACK24-NEXT: leal (%edx,%edx), %eax
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %eax
-; FALLBACK24-NEXT: orl %ebp, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %edi, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi
-; FALLBACK24-NEXT: movl %edi, %ebp
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx
-; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %ebp, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: addl %edi, %edi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: orl %edx, %edi
-; FALLBACK24-NEXT: movl %esi, %edx
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi
-; FALLBACK24-NEXT: movl %esi, %ebx
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax
-; FALLBACK24-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: orl %ebx, %ebp
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: addl %esi, %esi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: orl %ebx, %esi
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: orl %eax, %edx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl %ebx, 60(%eax)
-; FALLBACK24-NEXT: movl %edx, 56(%eax)
-; FALLBACK24-NEXT: movl %esi, 48(%eax)
-; FALLBACK24-NEXT: movl %ebp, 52(%eax)
-; FALLBACK24-NEXT: movl %edi, 40(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 44(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 32(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 36(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 24(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 28(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 16(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 20(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 8(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 12(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, (%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 4(%eax)
-; FALLBACK24-NEXT: addl $204, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: vzeroupper
-; FALLBACK24-NEXT: retl
-;
-; FALLBACK25-LABEL: lshr_64bytes:
-; FALLBACK25: # %bb.0:
-; FALLBACK25-NEXT: pushl %ebp
-; FALLBACK25-NEXT: pushl %ebx
-; FALLBACK25-NEXT: pushl %edi
-; FALLBACK25-NEXT: pushl %esi
-; FALLBACK25-NEXT: subl $188, %esp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK25-NEXT: movl (%eax), %ecx
-; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %ecx, %ebp
-; FALLBACK25-NEXT: andl $60, %ebp
-; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shll $3, %ecx
-; FALLBACK25-NEXT: andl $24, %ecx
-; FALLBACK25-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %esi
-; FALLBACK25-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl %esi, %edx
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edi
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edi
-; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK25-NEXT: movl %edx, 56(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK25-NEXT: shrl %cl, %eax
-; FALLBACK25-NEXT: movl %eax, 60(%ebp)
-; FALLBACK25-NEXT: movl %esi, 48(%ebp)
-; FALLBACK25-NEXT: movl %edi, 52(%ebp)
-; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 40(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 44(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 32(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 36(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 24(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 28(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 16(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 20(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 8(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 12(%ebp)
-; FALLBACK25-NEXT: movl %ebx, (%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 4(%ebp)
-; FALLBACK25-NEXT: addl $188, %esp
-; FALLBACK25-NEXT: popl %esi
-; FALLBACK25-NEXT: popl %edi
-; FALLBACK25-NEXT: popl %ebx
-; FALLBACK25-NEXT: popl %ebp
-; FALLBACK25-NEXT: vzeroupper
-; FALLBACK25-NEXT: retl
-;
-; FALLBACK26-LABEL: lshr_64bytes:
-; FALLBACK26: # %bb.0:
-; FALLBACK26-NEXT: pushl %ebp
-; FALLBACK26-NEXT: pushl %ebx
-; FALLBACK26-NEXT: pushl %edi
-; FALLBACK26-NEXT: pushl %esi
-; FALLBACK26-NEXT: subl $204, %esp
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK26-NEXT: movl (%eax), %ecx
-; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: leal (,%ecx,8), %edx
-; FALLBACK26-NEXT: andl $24, %edx
-; FALLBACK26-NEXT: andl $60, %ecx
-; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %esi, %edi
-; FALLBACK26-NEXT: movl %edx, %ebx
-; FALLBACK26-NEXT: notb %bl
-; FALLBACK26-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp
-; FALLBACK26-NEXT: orl %edi, %ebp
-; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK26-NEXT: addl %esi, %esi
-; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK26-NEXT: orl %edi, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK26-NEXT: leal (%esi,%esi), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK26-NEXT: orl %eax, %edi
-; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: leal (%eax,%eax), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: orl %esi, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK26-NEXT: leal (%esi,%esi), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK26-NEXT: orl %eax, %edi
-; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: leal (%eax,%eax), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: orl %esi, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: leal (%eax,%eax), %esi
-; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi
-; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %esi, %esi
-; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK26-NEXT: orl %eax, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp
-; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax
-; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi
-; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax
-; FALLBACK26-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK26-NEXT: orl %edi, %esi
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %eax, %eax
-; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx
-; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx
-; FALLBACK26-NEXT: addl %ecx, %ecx
-; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx
-; FALLBACK26-NEXT: orl %eax, %ebx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: movl %edx, 60(%ecx)
-; FALLBACK26-NEXT: movl %ebx, 56(%ecx)
-; FALLBACK26-NEXT: movl %edi, 48(%ecx)
-; FALLBACK26-NEXT: movl %esi, 52(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 40(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 44(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 32(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 36(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 24(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 28(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 16(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 20(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 8(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 12(%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, (%ecx)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: movl %eax, 4(%ecx)
-; FALLBACK26-NEXT: addl $204, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: vzeroupper
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: lshr_64bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $188, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK27-NEXT: movl (%eax), %ecx
-; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %ecx, %ebp
-; FALLBACK27-NEXT: andl $60, %ebp
-; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shll $3, %ecx
-; FALLBACK27-NEXT: andl $24, %ecx
-; FALLBACK27-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %esi
-; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl %edi, %edx
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK27-NEXT: movl %eax, 56(%ebp)
-; FALLBACK27-NEXT: movl %esi, 48(%ebp)
-; FALLBACK27-NEXT: movl %edx, 52(%ebp)
-; FALLBACK27-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 44(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 32(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 36(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 24(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 28(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 16(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 20(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 8(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 12(%ebp)
-; FALLBACK27-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK27-NEXT: movl %edi, (%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK27-NEXT: movl %eax, 60(%ebp)
-; FALLBACK27-NEXT: addl $188, %esp
-; FALLBACK27-NEXT: popl %esi
-; FALLBACK27-NEXT: popl %edi
-; FALLBACK27-NEXT: popl %ebx
-; FALLBACK27-NEXT: popl %ebp
-; FALLBACK27-NEXT: vzeroupper
-; FALLBACK27-NEXT: retl
-;
-; FALLBACK28-LABEL: lshr_64bytes:
-; FALLBACK28: # %bb.0:
-; FALLBACK28-NEXT: pushl %ebp
-; FALLBACK28-NEXT: pushl %ebx
-; FALLBACK28-NEXT: pushl %edi
-; FALLBACK28-NEXT: pushl %esi
-; FALLBACK28-NEXT: subl $204, %esp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: vmovups (%ecx), %zmm0
-; FALLBACK28-NEXT: movl (%eax), %ecx
-; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, %esi
-; FALLBACK28-NEXT: andl $60, %esi
-; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK28-NEXT: shll $3, %ecx
-; FALLBACK28-NEXT: andl $24, %ecx
-; FALLBACK28-NEXT: movl %edx, %edi
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: movl 72(%esp,%esi), %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: leal (%eax,%eax), %ebx
-; FALLBACK28-NEXT: movl %ecx, %ebp
-; FALLBACK28-NEXT: movb %cl, %ch
-; FALLBACK28-NEXT: notb %ch
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %edi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi
-; FALLBACK28-NEXT: movl %ebp, %eax
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: addl %edx, %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: orl %edi, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx
-; FALLBACK28-NEXT: movl %edx, %ebp
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK28-NEXT: leal (%edi,%edi), %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %ebp, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: addl %edx, %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: orl %ebx, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %ebp
-; FALLBACK28-NEXT: movl %eax, %edx
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: addl %eax, %eax
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %eax
-; FALLBACK28-NEXT: orl %ebp, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %edi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %ebp
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi
-; FALLBACK28-NEXT: leal (%edi,%edi), %eax
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %eax
-; FALLBACK28-NEXT: orl %ebp, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %eax, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %ebp
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx
-; FALLBACK28-NEXT: leal (%edx,%edx), %eax
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %eax
-; FALLBACK28-NEXT: orl %ebp, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %edi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi
-; FALLBACK28-NEXT: movl %edi, %ebp
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %ebp, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %edx
-; FALLBACK28-NEXT: addl %edi, %edi
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: orl %edx, %edi
-; FALLBACK28-NEXT: movl %esi, %edx
-; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi
-; FALLBACK28-NEXT: movl %esi, %ebx
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax
-; FALLBACK28-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: orl %ebx, %ebp
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: addl %esi, %esi
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: orl %ebx, %esi
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: orl %eax, %edx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl %ebx, 60(%eax)
-; FALLBACK28-NEXT: movl %edx, 56(%eax)
-; FALLBACK28-NEXT: movl %esi, 48(%eax)
-; FALLBACK28-NEXT: movl %ebp, 52(%eax)
-; FALLBACK28-NEXT: movl %edi, 40(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 44(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 32(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 36(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 24(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 28(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 16(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 20(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 8(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 12(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, (%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 4(%eax)
-; FALLBACK28-NEXT: addl $204, %esp
-; FALLBACK28-NEXT: popl %esi
-; FALLBACK28-NEXT: popl %edi
-; FALLBACK28-NEXT: popl %ebx
-; FALLBACK28-NEXT: popl %ebp
-; FALLBACK28-NEXT: vzeroupper
-; FALLBACK28-NEXT: retl
-;
-; FALLBACK29-LABEL: lshr_64bytes:
-; FALLBACK29: # %bb.0:
-; FALLBACK29-NEXT: pushl %ebp
-; FALLBACK29-NEXT: pushl %ebx
-; FALLBACK29-NEXT: pushl %edi
-; FALLBACK29-NEXT: pushl %esi
-; FALLBACK29-NEXT: subl $188, %esp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK29-NEXT: vmovups (%ecx), %zmm0
-; FALLBACK29-NEXT: movl (%eax), %ecx
-; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %ecx, %ebp
-; FALLBACK29-NEXT: andl $60, %ebp
-; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shll $3, %ecx
-; FALLBACK29-NEXT: andl $24, %ecx
-; FALLBACK29-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %esi
-; FALLBACK29-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl %esi, %edx
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edi
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edi
-; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK29-NEXT: movl %edx, 56(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK29-NEXT: shrl %cl, %eax
-; FALLBACK29-NEXT: movl %eax, 60(%ebp)
-; FALLBACK29-NEXT: movl %esi, 48(%ebp)
-; FALLBACK29-NEXT: movl %edi, 52(%ebp)
-; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 40(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 44(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 32(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 36(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 24(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 28(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 16(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 20(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 8(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 12(%ebp)
-; FALLBACK29-NEXT: movl %ebx, (%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 4(%ebp)
-; FALLBACK29-NEXT: addl $188, %esp
-; FALLBACK29-NEXT: popl %esi
-; FALLBACK29-NEXT: popl %edi
-; FALLBACK29-NEXT: popl %ebx
-; FALLBACK29-NEXT: popl %ebp
-; FALLBACK29-NEXT: vzeroupper
-; FALLBACK29-NEXT: retl
-;
-; FALLBACK30-LABEL: lshr_64bytes:
-; FALLBACK30: # %bb.0:
-; FALLBACK30-NEXT: pushl %ebp
-; FALLBACK30-NEXT: pushl %ebx
-; FALLBACK30-NEXT: pushl %edi
-; FALLBACK30-NEXT: pushl %esi
-; FALLBACK30-NEXT: subl $204, %esp
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: vmovups (%ecx), %zmm0
-; FALLBACK30-NEXT: movl (%eax), %edx
-; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: leal (,%edx,8), %ecx
-; FALLBACK30-NEXT: andl $24, %ecx
-; FALLBACK30-NEXT: andl $60, %edx
-; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi
-; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi
-; FALLBACK30-NEXT: movl %ecx, %ebx
-; FALLBACK30-NEXT: notb %bl
-; FALLBACK30-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp
-; FALLBACK30-NEXT: orl %edi, %ebp
-; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi
-; FALLBACK30-NEXT: addl %esi, %esi
-; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK30-NEXT: orl %edi, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi
-; FALLBACK30-NEXT: leal (%esi,%esi), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi
-; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK30-NEXT: orl %eax, %edi
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: leal (%eax,%eax), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi
-; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: orl %esi, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi
-; FALLBACK30-NEXT: leal (%esi,%esi), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi
-; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK30-NEXT: orl %eax, %edi
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: leal (%eax,%eax), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi
-; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: orl %esi, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: leal (%eax,%eax), %esi
-; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi
-; FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %esi, %esi
-; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK30-NEXT: orl %eax, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp
-; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax
-; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi
-; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax
-; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi
-; FALLBACK30-NEXT: orl %edi, %esi
-; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %eax, %eax
-; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax
-; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx
-; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp
-; FALLBACK30-NEXT: leal (%edx,%edx), %ecx
-; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx
-; FALLBACK30-NEXT: orl %eax, %edx
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: movl %ebp, 60(%ecx)
-; FALLBACK30-NEXT: movl %edx, 56(%ecx)
-; FALLBACK30-NEXT: movl %edi, 48(%ecx)
-; FALLBACK30-NEXT: movl %esi, 52(%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 40(%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 44(%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 32(%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 36(%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 24(%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 28(%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 16(%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 20(%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 8(%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 12(%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, (%ecx)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: movl %eax, 4(%ecx)
-; FALLBACK30-NEXT: addl $204, %esp
-; FALLBACK30-NEXT: popl %esi
-; FALLBACK30-NEXT: popl %edi
-; FALLBACK30-NEXT: popl %ebx
-; FALLBACK30-NEXT: popl %ebp
-; FALLBACK30-NEXT: vzeroupper
-; FALLBACK30-NEXT: retl
-;
-; FALLBACK31-LABEL: lshr_64bytes:
-; FALLBACK31: # %bb.0:
-; FALLBACK31-NEXT: pushl %ebp
-; FALLBACK31-NEXT: pushl %ebx
-; FALLBACK31-NEXT: pushl %edi
-; FALLBACK31-NEXT: pushl %esi
-; FALLBACK31-NEXT: subl $188, %esp
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK31-NEXT: vmovups (%ecx), %zmm0
-; FALLBACK31-NEXT: movl (%eax), %ecx
-; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %ecx, %ebp
-; FALLBACK31-NEXT: andl $60, %ebp
-; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shll $3, %ecx
-; FALLBACK31-NEXT: andl $24, %ecx
-; FALLBACK31-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %esi
-; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl %edi, %edx
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK31-NEXT: movl %eax, 56(%ebp)
-; FALLBACK31-NEXT: movl %esi, 48(%ebp)
-; FALLBACK31-NEXT: movl %edx, 52(%ebp)
-; FALLBACK31-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 44(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 32(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 36(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 24(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 28(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 16(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 20(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 8(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 12(%ebp)
-; FALLBACK31-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK31-NEXT: movl %edi, (%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK31-NEXT: movl %eax, 60(%ebp)
-; FALLBACK31-NEXT: addl $188, %esp
-; FALLBACK31-NEXT: popl %esi
-; FALLBACK31-NEXT: popl %edi
-; FALLBACK31-NEXT: popl %ebx
-; FALLBACK31-NEXT: popl %ebp
-; FALLBACK31-NEXT: vzeroupper
-; FALLBACK31-NEXT: retl
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %edi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rdi,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %edi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r8, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r15, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rdi), %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r13,%r13), %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rdi,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r13, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r8, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r10, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r15, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rax,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r14, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r12, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %r8d
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%r8,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %r8d
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%r8), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%r8), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%r8), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%r8), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r14, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%r8), %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%r8), %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r14, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%r8), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r8,%r8), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rax,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %r9d
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (,%r9,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %r9d
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r14, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%r9), %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %r14, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r12, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r9,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r8, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rsi), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%rsi,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r10, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rcx, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %r9d
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%r9,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %r9d
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r8,%r8), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%r9), %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r14, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%r9), %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %r14, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r9,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r8, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %edi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rdi,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %edi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%rdi), %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rsi, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%rdi), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%rdi), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rsi), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%rsi,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r9, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rcx, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%eax), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ecx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ecx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%ecx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ecx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%ebx,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 112(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 120(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 116(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 124(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebp, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 52(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%ecx), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%ecx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%ecx), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%ecx), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%ecx), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%ecx), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%ebx,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, 64(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 112(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ebx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 120(%esp,%ebx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%edi,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 116(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 124(%esp,%ebx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebx, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebp, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 52(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 72(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %cl, %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX1-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%ecx,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 72(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 120(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 116(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 124(%esp,%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, 60(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 56(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 48(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 52(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, 40(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 44(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 32(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 36(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 28(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 16(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 20(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, (%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 4(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 72(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %cl, %ch
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%ecx,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 72(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %ebp, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 120(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 116(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 124(%esp,%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%ecx,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 60(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 56(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 48(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 52(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, 40(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, (%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 4(%ecx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
@@ -16020,3770 +13714,3774 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
}
define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: shl_64bytes:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %r15
-; FALLBACK0-NEXT: pushq %r14
-; FALLBACK0-NEXT: pushq %r13
-; FALLBACK0-NEXT: pushq %r12
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rax
-; FALLBACK0-NEXT: movq 8(%rdi), %rcx
-; FALLBACK0-NEXT: movq 16(%rdi), %r8
-; FALLBACK0-NEXT: movq 24(%rdi), %r9
-; FALLBACK0-NEXT: movq 32(%rdi), %r10
-; FALLBACK0-NEXT: movq 40(%rdi), %r11
-; FALLBACK0-NEXT: movq 48(%rdi), %rbx
-; FALLBACK0-NEXT: movq 56(%rdi), %rdi
-; FALLBACK0-NEXT: movl (%rsi), %esi
-; FALLBACK0-NEXT: xorps %xmm0, %xmm0
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: leal (,%rsi,8), %eax
-; FALLBACK0-NEXT: andl $56, %eax
-; FALLBACK0-NEXT: andl $56, %esi
-; FALLBACK0-NEXT: negl %esi
-; FALLBACK0-NEXT: movslq %esi, %rbx
-; FALLBACK0-NEXT: movq -64(%rsp,%rbx), %r8
-; FALLBACK0-NEXT: movq -56(%rsp,%rbx), %rdi
-; FALLBACK0-NEXT: movq %rdi, %r10
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq %r8, %r9
-; FALLBACK0-NEXT: shrq %r9
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r9
-; FALLBACK0-NEXT: orq %r10, %r9
-; FALLBACK0-NEXT: movq -40(%rsp,%rbx), %r10
-; FALLBACK0-NEXT: movq %r10, %r14
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r14
-; FALLBACK0-NEXT: movq -48(%rsp,%rbx), %r15
-; FALLBACK0-NEXT: movq %r15, %r11
-; FALLBACK0-NEXT: shrq %r11
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r11
-; FALLBACK0-NEXT: orq %r14, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r15
-; FALLBACK0-NEXT: shrq %rdi
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rdi
-; FALLBACK0-NEXT: orq %r15, %rdi
-; FALLBACK0-NEXT: movq -24(%rsp,%rbx), %r14
-; FALLBACK0-NEXT: movq %r14, %r12
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r12
-; FALLBACK0-NEXT: movq -32(%rsp,%rbx), %r13
-; FALLBACK0-NEXT: movq %r13, %r15
-; FALLBACK0-NEXT: shrq %r15
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r15
-; FALLBACK0-NEXT: orq %r12, %r15
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r13
-; FALLBACK0-NEXT: shrq %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r10
-; FALLBACK0-NEXT: orq %r13, %r10
-; FALLBACK0-NEXT: movq -8(%rsp,%rbx), %r12
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r12
-; FALLBACK0-NEXT: movq -16(%rsp,%rbx), %rbx
-; FALLBACK0-NEXT: movq %rbx, %r13
-; FALLBACK0-NEXT: shrq %r13
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r13
-; FALLBACK0-NEXT: orq %r12, %r13
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rbx
-; FALLBACK0-NEXT: shrq %r14
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r14
-; FALLBACK0-NEXT: orq %rbx, %r14
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: movq %r8, (%rdx)
-; FALLBACK0-NEXT: movq %r14, 48(%rdx)
-; FALLBACK0-NEXT: movq %r13, 56(%rdx)
-; FALLBACK0-NEXT: movq %r10, 32(%rdx)
-; FALLBACK0-NEXT: movq %r15, 40(%rdx)
-; FALLBACK0-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK0-NEXT: movq %r11, 24(%rdx)
-; FALLBACK0-NEXT: movq %r9, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: popq %r12
-; FALLBACK0-NEXT: popq %r13
-; FALLBACK0-NEXT: popq %r14
-; FALLBACK0-NEXT: popq %r15
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: shl_64bytes:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: pushq %r14
-; FALLBACK1-NEXT: pushq %rbx
-; FALLBACK1-NEXT: pushq %rax
-; FALLBACK1-NEXT: movq (%rdi), %rax
-; FALLBACK1-NEXT: movq 8(%rdi), %rcx
-; FALLBACK1-NEXT: movq 16(%rdi), %r8
-; FALLBACK1-NEXT: movq 24(%rdi), %r9
-; FALLBACK1-NEXT: movq 32(%rdi), %r10
-; FALLBACK1-NEXT: movq 40(%rdi), %r11
-; FALLBACK1-NEXT: movq 48(%rdi), %rbx
-; FALLBACK1-NEXT: movq 56(%rdi), %rdi
-; FALLBACK1-NEXT: movl (%rsi), %esi
-; FALLBACK1-NEXT: xorps %xmm0, %xmm0
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK1-NEXT: andl $56, %ecx
-; FALLBACK1-NEXT: andl $56, %esi
-; FALLBACK1-NEXT: negl %esi
-; FALLBACK1-NEXT: movslq %esi, %r9
-; FALLBACK1-NEXT: movq -48(%rsp,%r9), %rax
-; FALLBACK1-NEXT: movq -40(%rsp,%r9), %r10
-; FALLBACK1-NEXT: movq %r10, %rsi
-; FALLBACK1-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK1-NEXT: movq -64(%rsp,%r9), %r8
-; FALLBACK1-NEXT: movq -56(%rsp,%r9), %rdi
-; FALLBACK1-NEXT: shldq %cl, %rdi, %rax
-; FALLBACK1-NEXT: movq -32(%rsp,%r9), %r11
-; FALLBACK1-NEXT: movq -24(%rsp,%r9), %rbx
-; FALLBACK1-NEXT: movq %rbx, %r14
-; FALLBACK1-NEXT: shldq %cl, %r11, %r14
-; FALLBACK1-NEXT: shldq %cl, %r10, %r11
-; FALLBACK1-NEXT: movq -16(%rsp,%r9), %r10
-; FALLBACK1-NEXT: movq -8(%rsp,%r9), %r9
-; FALLBACK1-NEXT: shldq %cl, %r10, %r9
-; FALLBACK1-NEXT: shldq %cl, %rbx, %r10
-; FALLBACK1-NEXT: shldq %cl, %r8, %rdi
-; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK1-NEXT: shlq %cl, %r8
-; FALLBACK1-NEXT: movq %r10, 48(%rdx)
-; FALLBACK1-NEXT: movq %r9, 56(%rdx)
-; FALLBACK1-NEXT: movq %r11, 32(%rdx)
-; FALLBACK1-NEXT: movq %r14, 40(%rdx)
-; FALLBACK1-NEXT: movq %rax, 16(%rdx)
-; FALLBACK1-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK1-NEXT: movq %r8, (%rdx)
-; FALLBACK1-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK1-NEXT: addq $8, %rsp
-; FALLBACK1-NEXT: popq %rbx
-; FALLBACK1-NEXT: popq %r14
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: shl_64bytes:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: pushq %rbp
-; FALLBACK2-NEXT: pushq %r15
-; FALLBACK2-NEXT: pushq %r14
-; FALLBACK2-NEXT: pushq %r13
-; FALLBACK2-NEXT: pushq %r12
-; FALLBACK2-NEXT: pushq %rbx
-; FALLBACK2-NEXT: pushq %rax
-; FALLBACK2-NEXT: movq (%rdi), %rax
-; FALLBACK2-NEXT: movq 8(%rdi), %rcx
-; FALLBACK2-NEXT: movq 16(%rdi), %r8
-; FALLBACK2-NEXT: movq 24(%rdi), %r9
-; FALLBACK2-NEXT: movq 32(%rdi), %r10
-; FALLBACK2-NEXT: movq 40(%rdi), %r11
-; FALLBACK2-NEXT: movq 48(%rdi), %rbx
-; FALLBACK2-NEXT: movq 56(%rdi), %rdi
-; FALLBACK2-NEXT: movl (%rsi), %esi
-; FALLBACK2-NEXT: xorps %xmm0, %xmm0
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: leal (,%rsi,8), %eax
-; FALLBACK2-NEXT: andl $56, %eax
-; FALLBACK2-NEXT: andl $56, %esi
-; FALLBACK2-NEXT: negl %esi
-; FALLBACK2-NEXT: movslq %esi, %rsi
-; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10
-; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx
-; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9
-; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi
-; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11
-; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14
-; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx
-; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8
-; FALLBACK2-NEXT: shlxq %rax, %r8, %r15
-; FALLBACK2-NEXT: shlxq %rax, %r10, %r12
-; FALLBACK2-NEXT: movl %eax, %r13d
-; FALLBACK2-NEXT: notb %r13b
-; FALLBACK2-NEXT: shrq %r10
-; FALLBACK2-NEXT: shrxq %r13, %r10, %r10
-; FALLBACK2-NEXT: orq %r9, %r10
-; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9
-; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp
-; FALLBACK2-NEXT: shrq %r14
-; FALLBACK2-NEXT: shrxq %r13, %r14, %r14
-; FALLBACK2-NEXT: orq %r11, %r14
-; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11
-; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi
-; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax
-; FALLBACK2-NEXT: shrq %rcx
-; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx
-; FALLBACK2-NEXT: orq %rbx, %rcx
-; FALLBACK2-NEXT: shrq %r9
-; FALLBACK2-NEXT: shrxq %r13, %r9, %r9
-; FALLBACK2-NEXT: orq %r15, %r9
-; FALLBACK2-NEXT: shrq %rdi
-; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi
-; FALLBACK2-NEXT: orq %rbp, %rdi
-; FALLBACK2-NEXT: shrq %rsi
-; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r11, %rsi
-; FALLBACK2-NEXT: shrq %r8
-; FALLBACK2-NEXT: shrxq %r13, %r8, %r8
-; FALLBACK2-NEXT: orq %rax, %r8
-; FALLBACK2-NEXT: movq %r12, (%rdx)
-; FALLBACK2-NEXT: movq %r8, 48(%rdx)
-; FALLBACK2-NEXT: movq %rsi, 56(%rdx)
-; FALLBACK2-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK2-NEXT: movq %r9, 40(%rdx)
-; FALLBACK2-NEXT: movq %rcx, 16(%rdx)
-; FALLBACK2-NEXT: movq %r14, 24(%rdx)
-; FALLBACK2-NEXT: movq %r10, 8(%rdx)
-; FALLBACK2-NEXT: addq $8, %rsp
-; FALLBACK2-NEXT: popq %rbx
-; FALLBACK2-NEXT: popq %r12
-; FALLBACK2-NEXT: popq %r13
-; FALLBACK2-NEXT: popq %r14
-; FALLBACK2-NEXT: popq %r15
-; FALLBACK2-NEXT: popq %rbp
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: shl_64bytes:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: pushq %r14
-; FALLBACK3-NEXT: pushq %rbx
-; FALLBACK3-NEXT: pushq %rax
-; FALLBACK3-NEXT: movq (%rdi), %rax
-; FALLBACK3-NEXT: movq 8(%rdi), %rcx
-; FALLBACK3-NEXT: movq 16(%rdi), %r8
-; FALLBACK3-NEXT: movq 24(%rdi), %r9
-; FALLBACK3-NEXT: movq 32(%rdi), %r10
-; FALLBACK3-NEXT: movq 40(%rdi), %r11
-; FALLBACK3-NEXT: movq 48(%rdi), %rbx
-; FALLBACK3-NEXT: movq 56(%rdi), %rdi
-; FALLBACK3-NEXT: movl (%rsi), %esi
-; FALLBACK3-NEXT: xorps %xmm0, %xmm0
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: leal (,%rsi,8), %ecx
-; FALLBACK3-NEXT: andl $56, %ecx
-; FALLBACK3-NEXT: andl $56, %esi
-; FALLBACK3-NEXT: negl %esi
-; FALLBACK3-NEXT: movslq %esi, %r8
-; FALLBACK3-NEXT: movq -48(%rsp,%r8), %rax
-; FALLBACK3-NEXT: movq -40(%rsp,%r8), %r9
-; FALLBACK3-NEXT: movq %r9, %rsi
-; FALLBACK3-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK3-NEXT: movq -64(%rsp,%r8), %r10
-; FALLBACK3-NEXT: movq -56(%rsp,%r8), %rdi
-; FALLBACK3-NEXT: shldq %cl, %rdi, %rax
-; FALLBACK3-NEXT: movq -32(%rsp,%r8), %r11
-; FALLBACK3-NEXT: movq -24(%rsp,%r8), %rbx
-; FALLBACK3-NEXT: movq %rbx, %r14
-; FALLBACK3-NEXT: shldq %cl, %r11, %r14
-; FALLBACK3-NEXT: shldq %cl, %r9, %r11
-; FALLBACK3-NEXT: movq -16(%rsp,%r8), %r9
-; FALLBACK3-NEXT: movq -8(%rsp,%r8), %r8
-; FALLBACK3-NEXT: shldq %cl, %r9, %r8
-; FALLBACK3-NEXT: shldq %cl, %rbx, %r9
-; FALLBACK3-NEXT: shldq %cl, %r10, %rdi
-; FALLBACK3-NEXT: shlxq %rcx, %r10, %rcx
-; FALLBACK3-NEXT: movq %r9, 48(%rdx)
-; FALLBACK3-NEXT: movq %r8, 56(%rdx)
-; FALLBACK3-NEXT: movq %r11, 32(%rdx)
-; FALLBACK3-NEXT: movq %r14, 40(%rdx)
-; FALLBACK3-NEXT: movq %rax, 16(%rdx)
-; FALLBACK3-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK3-NEXT: movq %rcx, (%rdx)
-; FALLBACK3-NEXT: movq %rdi, 8(%rdx)
-; FALLBACK3-NEXT: addq $8, %rsp
-; FALLBACK3-NEXT: popq %rbx
-; FALLBACK3-NEXT: popq %r14
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: shl_64bytes:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: pushq %r15
-; FALLBACK4-NEXT: pushq %r14
-; FALLBACK4-NEXT: pushq %r13
-; FALLBACK4-NEXT: pushq %r12
-; FALLBACK4-NEXT: pushq %rbx
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK4-NEXT: movups 48(%rdi), %xmm3
-; FALLBACK4-NEXT: movl (%rsi), %ecx
-; FALLBACK4-NEXT: xorps %xmm4, %xmm4
-; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: leal (,%rcx,8), %eax
-; FALLBACK4-NEXT: andl $56, %eax
-; FALLBACK4-NEXT: andl $56, %ecx
-; FALLBACK4-NEXT: negl %ecx
-; FALLBACK4-NEXT: movslq %ecx, %r9
-; FALLBACK4-NEXT: movq -24(%rsp,%r9), %rdi
-; FALLBACK4-NEXT: movq %rdi, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: movq -32(%rsp,%r9), %r11
-; FALLBACK4-NEXT: movq %r11, %r8
-; FALLBACK4-NEXT: shrq %r8
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r8
-; FALLBACK4-NEXT: orq %r10, %r8
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r11
-; FALLBACK4-NEXT: movq -40(%rsp,%r9), %rbx
-; FALLBACK4-NEXT: movq %rbx, %r10
-; FALLBACK4-NEXT: shrq %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: orq %r11, %r10
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rbx
-; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r15
-; FALLBACK4-NEXT: movq %r15, %r11
-; FALLBACK4-NEXT: shrq %r11
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r11
-; FALLBACK4-NEXT: orq %rbx, %r11
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r15
-; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r14
-; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r12
-; FALLBACK4-NEXT: movq %r12, %rbx
-; FALLBACK4-NEXT: shrq %rbx
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %rbx
-; FALLBACK4-NEXT: orq %r15, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r12
-; FALLBACK4-NEXT: movq %r14, %r15
-; FALLBACK4-NEXT: shrq %r15
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r15
-; FALLBACK4-NEXT: orq %r12, %r15
-; FALLBACK4-NEXT: movq -16(%rsp,%r9), %r12
-; FALLBACK4-NEXT: movq %r12, %r13
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r13
-; FALLBACK4-NEXT: shrq %rdi
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %rdi
-; FALLBACK4-NEXT: orq %r13, %rdi
-; FALLBACK4-NEXT: movq -8(%rsp,%r9), %r9
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r9
-; FALLBACK4-NEXT: shrq %r12
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r12
-; FALLBACK4-NEXT: orq %r9, %r12
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r14
-; FALLBACK4-NEXT: movq %r14, (%rdx)
-; FALLBACK4-NEXT: movq %r12, 56(%rdx)
-; FALLBACK4-NEXT: movq %rdi, 48(%rdx)
-; FALLBACK4-NEXT: movq %r15, 8(%rdx)
-; FALLBACK4-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK4-NEXT: movq %r11, 24(%rdx)
-; FALLBACK4-NEXT: movq %r10, 32(%rdx)
-; FALLBACK4-NEXT: movq %r8, 40(%rdx)
-; FALLBACK4-NEXT: popq %rbx
-; FALLBACK4-NEXT: popq %r12
-; FALLBACK4-NEXT: popq %r13
-; FALLBACK4-NEXT: popq %r14
-; FALLBACK4-NEXT: popq %r15
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: shl_64bytes:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: pushq %r15
-; FALLBACK5-NEXT: pushq %r14
-; FALLBACK5-NEXT: pushq %rbx
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK5-NEXT: movups 48(%rdi), %xmm3
-; FALLBACK5-NEXT: movl (%rsi), %eax
-; FALLBACK5-NEXT: xorps %xmm4, %xmm4
-; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: leal (,%rax,8), %ecx
-; FALLBACK5-NEXT: andl $56, %ecx
-; FALLBACK5-NEXT: andl $56, %eax
-; FALLBACK5-NEXT: negl %eax
-; FALLBACK5-NEXT: movslq %eax, %r8
-; FALLBACK5-NEXT: movq -32(%rsp,%r8), %rax
-; FALLBACK5-NEXT: movq -24(%rsp,%r8), %r9
-; FALLBACK5-NEXT: movq %r9, %rsi
-; FALLBACK5-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK5-NEXT: movq -40(%rsp,%r8), %rdi
-; FALLBACK5-NEXT: shldq %cl, %rdi, %rax
-; FALLBACK5-NEXT: movq -48(%rsp,%r8), %r10
-; FALLBACK5-NEXT: shldq %cl, %r10, %rdi
-; FALLBACK5-NEXT: movq -64(%rsp,%r8), %r11
-; FALLBACK5-NEXT: movq -56(%rsp,%r8), %rbx
-; FALLBACK5-NEXT: shldq %cl, %rbx, %r10
-; FALLBACK5-NEXT: movq -16(%rsp,%r8), %r14
-; FALLBACK5-NEXT: movq %r14, %r15
-; FALLBACK5-NEXT: shldq %cl, %r9, %r15
-; FALLBACK5-NEXT: movq -8(%rsp,%r8), %r8
-; FALLBACK5-NEXT: shldq %cl, %r14, %r8
-; FALLBACK5-NEXT: movq %r11, %r9
-; FALLBACK5-NEXT: shlq %cl, %r9
-; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK5-NEXT: shldq %cl, %r11, %rbx
-; FALLBACK5-NEXT: movq %r8, 56(%rdx)
-; FALLBACK5-NEXT: movq %r15, 48(%rdx)
-; FALLBACK5-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK5-NEXT: movq %r10, 16(%rdx)
-; FALLBACK5-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK5-NEXT: movq %rax, 32(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 40(%rdx)
-; FALLBACK5-NEXT: movq %r9, (%rdx)
-; FALLBACK5-NEXT: popq %rbx
-; FALLBACK5-NEXT: popq %r14
-; FALLBACK5-NEXT: popq %r15
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: shl_64bytes:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: pushq %rbp
-; FALLBACK6-NEXT: pushq %r15
-; FALLBACK6-NEXT: pushq %r14
-; FALLBACK6-NEXT: pushq %r13
-; FALLBACK6-NEXT: pushq %r12
-; FALLBACK6-NEXT: pushq %rbx
-; FALLBACK6-NEXT: subq $24, %rsp
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK6-NEXT: movups 48(%rdi), %xmm3
-; FALLBACK6-NEXT: movl (%rsi), %eax
-; FALLBACK6-NEXT: xorps %xmm4, %xmm4
-; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm3, (%rsp)
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: leal (,%rax,8), %ecx
-; FALLBACK6-NEXT: andl $56, %ecx
-; FALLBACK6-NEXT: andl $56, %eax
-; FALLBACK6-NEXT: negl %eax
-; FALLBACK6-NEXT: movslq %eax, %rsi
-; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax
-; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12
-; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi
-; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15
-; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13
-; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8
-; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r11
-; FALLBACK6-NEXT: shlxq %rcx, %r11, %r10
-; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14
-; FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx
-; FALLBACK6-NEXT: movl %ecx, %r9d
-; FALLBACK6-NEXT: notb %r9b
-; FALLBACK6-NEXT: shrq %rdi
-; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi
-; FALLBACK6-NEXT: orq %r12, %rdi
-; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp
-; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8
-; FALLBACK6-NEXT: shrq %r13
-; FALLBACK6-NEXT: shrxq %r9, %r13, %r12
-; FALLBACK6-NEXT: orq %r15, %r12
-; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15
-; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi
-; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx
-; FALLBACK6-NEXT: shrq %r11
-; FALLBACK6-NEXT: shrxq %r9, %r11, %r11
-; FALLBACK6-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; FALLBACK6-NEXT: shrq %r14
-; FALLBACK6-NEXT: shrxq %r9, %r14, %r14
-; FALLBACK6-NEXT: orq %r10, %r14
-; FALLBACK6-NEXT: shrq %rsi
-; FALLBACK6-NEXT: shrxq %r9, %rsi, %rsi
-; FALLBACK6-NEXT: orq %rbx, %rsi
-; FALLBACK6-NEXT: shrq %rax
-; FALLBACK6-NEXT: shrxq %r9, %rax, %rax
-; FALLBACK6-NEXT: orq %r8, %rax
-; FALLBACK6-NEXT: shrq %rbp
-; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8
-; FALLBACK6-NEXT: orq %r15, %r8
-; FALLBACK6-NEXT: movq %rcx, (%rdx)
-; FALLBACK6-NEXT: movq %r8, 56(%rdx)
-; FALLBACK6-NEXT: movq %rax, 48(%rdx)
-; FALLBACK6-NEXT: movq %rsi, 8(%rdx)
-; FALLBACK6-NEXT: movq %r14, 16(%rdx)
-; FALLBACK6-NEXT: movq %r11, 24(%rdx)
-; FALLBACK6-NEXT: movq %r12, 32(%rdx)
-; FALLBACK6-NEXT: movq %rdi, 40(%rdx)
-; FALLBACK6-NEXT: addq $24, %rsp
-; FALLBACK6-NEXT: popq %rbx
-; FALLBACK6-NEXT: popq %r12
-; FALLBACK6-NEXT: popq %r13
-; FALLBACK6-NEXT: popq %r14
-; FALLBACK6-NEXT: popq %r15
-; FALLBACK6-NEXT: popq %rbp
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: shl_64bytes:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: pushq %r15
-; FALLBACK7-NEXT: pushq %r14
-; FALLBACK7-NEXT: pushq %rbx
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK7-NEXT: movups 48(%rdi), %xmm3
-; FALLBACK7-NEXT: movl (%rsi), %eax
-; FALLBACK7-NEXT: xorps %xmm4, %xmm4
-; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: leal (,%rax,8), %ecx
-; FALLBACK7-NEXT: andl $56, %ecx
-; FALLBACK7-NEXT: andl $56, %eax
-; FALLBACK7-NEXT: negl %eax
-; FALLBACK7-NEXT: movslq %eax, %r8
-; FALLBACK7-NEXT: movq -32(%rsp,%r8), %rax
-; FALLBACK7-NEXT: movq -24(%rsp,%r8), %r9
-; FALLBACK7-NEXT: movq %r9, %rsi
-; FALLBACK7-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK7-NEXT: movq -40(%rsp,%r8), %rdi
-; FALLBACK7-NEXT: shldq %cl, %rdi, %rax
-; FALLBACK7-NEXT: movq -48(%rsp,%r8), %r10
-; FALLBACK7-NEXT: shldq %cl, %r10, %rdi
-; FALLBACK7-NEXT: movq -64(%rsp,%r8), %r11
-; FALLBACK7-NEXT: movq -56(%rsp,%r8), %rbx
-; FALLBACK7-NEXT: shldq %cl, %rbx, %r10
-; FALLBACK7-NEXT: movq -16(%rsp,%r8), %r14
-; FALLBACK7-NEXT: movq %r14, %r15
-; FALLBACK7-NEXT: shldq %cl, %r9, %r15
-; FALLBACK7-NEXT: movq -8(%rsp,%r8), %r8
-; FALLBACK7-NEXT: shldq %cl, %r14, %r8
-; FALLBACK7-NEXT: shlxq %rcx, %r11, %r9
-; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK7-NEXT: shldq %cl, %r11, %rbx
-; FALLBACK7-NEXT: movq %r8, 56(%rdx)
-; FALLBACK7-NEXT: movq %r15, 48(%rdx)
-; FALLBACK7-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK7-NEXT: movq %r10, 16(%rdx)
-; FALLBACK7-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK7-NEXT: movq %rax, 32(%rdx)
-; FALLBACK7-NEXT: movq %rsi, 40(%rdx)
-; FALLBACK7-NEXT: movq %r9, (%rdx)
-; FALLBACK7-NEXT: popq %rbx
-; FALLBACK7-NEXT: popq %r14
-; FALLBACK7-NEXT: popq %r15
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: shl_64bytes:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: pushq %r15
-; FALLBACK8-NEXT: pushq %r14
-; FALLBACK8-NEXT: pushq %r13
-; FALLBACK8-NEXT: pushq %r12
-; FALLBACK8-NEXT: pushq %rbx
-; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1
-; FALLBACK8-NEXT: movl (%rsi), %ecx
-; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: leal (,%rcx,8), %eax
-; FALLBACK8-NEXT: andl $56, %eax
-; FALLBACK8-NEXT: andl $56, %ecx
-; FALLBACK8-NEXT: negl %ecx
-; FALLBACK8-NEXT: movslq %ecx, %r9
-; FALLBACK8-NEXT: movq -24(%rsp,%r9), %rdi
-; FALLBACK8-NEXT: movq %rdi, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: movq -32(%rsp,%r9), %r11
-; FALLBACK8-NEXT: movq %r11, %r8
-; FALLBACK8-NEXT: shrq %r8
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r8
-; FALLBACK8-NEXT: orq %r10, %r8
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r11
-; FALLBACK8-NEXT: movq -40(%rsp,%r9), %rbx
-; FALLBACK8-NEXT: movq %rbx, %r10
-; FALLBACK8-NEXT: shrq %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: orq %r11, %r10
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rbx
-; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r15
-; FALLBACK8-NEXT: movq %r15, %r11
-; FALLBACK8-NEXT: shrq %r11
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r11
-; FALLBACK8-NEXT: orq %rbx, %r11
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r15
-; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r14
-; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r12
-; FALLBACK8-NEXT: movq %r12, %rbx
-; FALLBACK8-NEXT: shrq %rbx
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %rbx
-; FALLBACK8-NEXT: orq %r15, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r12
-; FALLBACK8-NEXT: movq %r14, %r15
-; FALLBACK8-NEXT: shrq %r15
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r15
-; FALLBACK8-NEXT: orq %r12, %r15
-; FALLBACK8-NEXT: movq -16(%rsp,%r9), %r12
-; FALLBACK8-NEXT: movq %r12, %r13
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r13
-; FALLBACK8-NEXT: shrq %rdi
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %rdi
-; FALLBACK8-NEXT: orq %r13, %rdi
-; FALLBACK8-NEXT: movq -8(%rsp,%r9), %r9
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r9
-; FALLBACK8-NEXT: shrq %r12
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r12
-; FALLBACK8-NEXT: orq %r9, %r12
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r14
-; FALLBACK8-NEXT: movq %r14, (%rdx)
-; FALLBACK8-NEXT: movq %r12, 56(%rdx)
-; FALLBACK8-NEXT: movq %rdi, 48(%rdx)
-; FALLBACK8-NEXT: movq %r15, 8(%rdx)
-; FALLBACK8-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK8-NEXT: movq %r11, 24(%rdx)
-; FALLBACK8-NEXT: movq %r10, 32(%rdx)
-; FALLBACK8-NEXT: movq %r8, 40(%rdx)
-; FALLBACK8-NEXT: popq %rbx
-; FALLBACK8-NEXT: popq %r12
-; FALLBACK8-NEXT: popq %r13
-; FALLBACK8-NEXT: popq %r14
-; FALLBACK8-NEXT: popq %r15
-; FALLBACK8-NEXT: vzeroupper
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: shl_64bytes:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: pushq %r15
-; FALLBACK9-NEXT: pushq %r14
-; FALLBACK9-NEXT: pushq %rbx
-; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1
-; FALLBACK9-NEXT: movl (%rsi), %eax
-; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: leal (,%rax,8), %ecx
-; FALLBACK9-NEXT: andl $56, %ecx
-; FALLBACK9-NEXT: andl $56, %eax
-; FALLBACK9-NEXT: negl %eax
-; FALLBACK9-NEXT: movslq %eax, %r8
-; FALLBACK9-NEXT: movq -32(%rsp,%r8), %rax
-; FALLBACK9-NEXT: movq -24(%rsp,%r8), %r9
-; FALLBACK9-NEXT: movq %r9, %rsi
-; FALLBACK9-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK9-NEXT: movq -40(%rsp,%r8), %rdi
-; FALLBACK9-NEXT: shldq %cl, %rdi, %rax
-; FALLBACK9-NEXT: movq -48(%rsp,%r8), %r10
-; FALLBACK9-NEXT: shldq %cl, %r10, %rdi
-; FALLBACK9-NEXT: movq -64(%rsp,%r8), %r11
-; FALLBACK9-NEXT: movq -56(%rsp,%r8), %rbx
-; FALLBACK9-NEXT: shldq %cl, %rbx, %r10
-; FALLBACK9-NEXT: movq -16(%rsp,%r8), %r14
-; FALLBACK9-NEXT: movq %r14, %r15
-; FALLBACK9-NEXT: shldq %cl, %r9, %r15
-; FALLBACK9-NEXT: movq -8(%rsp,%r8), %r8
-; FALLBACK9-NEXT: shldq %cl, %r14, %r8
-; FALLBACK9-NEXT: movq %r11, %r9
-; FALLBACK9-NEXT: shlq %cl, %r9
-; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK9-NEXT: shldq %cl, %r11, %rbx
-; FALLBACK9-NEXT: movq %r8, 56(%rdx)
-; FALLBACK9-NEXT: movq %r15, 48(%rdx)
-; FALLBACK9-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK9-NEXT: movq %r10, 16(%rdx)
-; FALLBACK9-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK9-NEXT: movq %rax, 32(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 40(%rdx)
-; FALLBACK9-NEXT: movq %r9, (%rdx)
-; FALLBACK9-NEXT: popq %rbx
-; FALLBACK9-NEXT: popq %r14
-; FALLBACK9-NEXT: popq %r15
-; FALLBACK9-NEXT: vzeroupper
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: shl_64bytes:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: pushq %rbp
-; FALLBACK10-NEXT: pushq %r15
-; FALLBACK10-NEXT: pushq %r14
-; FALLBACK10-NEXT: pushq %r13
-; FALLBACK10-NEXT: pushq %r12
-; FALLBACK10-NEXT: pushq %rbx
-; FALLBACK10-NEXT: subq $24, %rsp
-; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1
-; FALLBACK10-NEXT: movl (%rsi), %eax
-; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: leal (,%rax,8), %ecx
-; FALLBACK10-NEXT: andl $56, %ecx
-; FALLBACK10-NEXT: andl $56, %eax
-; FALLBACK10-NEXT: negl %eax
-; FALLBACK10-NEXT: movslq %eax, %rsi
-; FALLBACK10-NEXT: movq -8(%rsp,%rsi), %rax
-; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12
-; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi
-; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15
-; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13
-; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8
-; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r11
-; FALLBACK10-NEXT: shlxq %rcx, %r11, %r10
-; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14
-; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx
-; FALLBACK10-NEXT: movl %ecx, %r9d
-; FALLBACK10-NEXT: notb %r9b
-; FALLBACK10-NEXT: shrq %rdi
-; FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi
-; FALLBACK10-NEXT: orq %r12, %rdi
-; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp
-; FALLBACK10-NEXT: shlxq %rcx, %rbp, %r8
-; FALLBACK10-NEXT: shrq %r13
-; FALLBACK10-NEXT: shrxq %r9, %r13, %r12
-; FALLBACK10-NEXT: orq %r15, %r12
-; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15
-; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi
-; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx
-; FALLBACK10-NEXT: shrq %r11
-; FALLBACK10-NEXT: shrxq %r9, %r11, %r11
-; FALLBACK10-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; FALLBACK10-NEXT: shrq %r14
-; FALLBACK10-NEXT: shrxq %r9, %r14, %r14
-; FALLBACK10-NEXT: orq %r10, %r14
-; FALLBACK10-NEXT: shrq %rsi
-; FALLBACK10-NEXT: shrxq %r9, %rsi, %rsi
-; FALLBACK10-NEXT: orq %rbx, %rsi
-; FALLBACK10-NEXT: shrq %rax
-; FALLBACK10-NEXT: shrxq %r9, %rax, %rax
-; FALLBACK10-NEXT: orq %r8, %rax
-; FALLBACK10-NEXT: shrq %rbp
-; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8
-; FALLBACK10-NEXT: orq %r15, %r8
-; FALLBACK10-NEXT: movq %rcx, (%rdx)
-; FALLBACK10-NEXT: movq %r8, 56(%rdx)
-; FALLBACK10-NEXT: movq %rax, 48(%rdx)
-; FALLBACK10-NEXT: movq %rsi, 8(%rdx)
-; FALLBACK10-NEXT: movq %r14, 16(%rdx)
-; FALLBACK10-NEXT: movq %r11, 24(%rdx)
-; FALLBACK10-NEXT: movq %r12, 32(%rdx)
-; FALLBACK10-NEXT: movq %rdi, 40(%rdx)
-; FALLBACK10-NEXT: addq $24, %rsp
-; FALLBACK10-NEXT: popq %rbx
-; FALLBACK10-NEXT: popq %r12
-; FALLBACK10-NEXT: popq %r13
-; FALLBACK10-NEXT: popq %r14
-; FALLBACK10-NEXT: popq %r15
-; FALLBACK10-NEXT: popq %rbp
-; FALLBACK10-NEXT: vzeroupper
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: shl_64bytes:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: pushq %r15
-; FALLBACK11-NEXT: pushq %r14
-; FALLBACK11-NEXT: pushq %rbx
-; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1
-; FALLBACK11-NEXT: movl (%rsi), %eax
-; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: leal (,%rax,8), %ecx
-; FALLBACK11-NEXT: andl $56, %ecx
-; FALLBACK11-NEXT: andl $56, %eax
-; FALLBACK11-NEXT: negl %eax
-; FALLBACK11-NEXT: movslq %eax, %r8
-; FALLBACK11-NEXT: movq -32(%rsp,%r8), %rax
-; FALLBACK11-NEXT: movq -24(%rsp,%r8), %r9
-; FALLBACK11-NEXT: movq %r9, %rsi
-; FALLBACK11-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK11-NEXT: movq -40(%rsp,%r8), %rdi
-; FALLBACK11-NEXT: shldq %cl, %rdi, %rax
-; FALLBACK11-NEXT: movq -48(%rsp,%r8), %r10
-; FALLBACK11-NEXT: shldq %cl, %r10, %rdi
-; FALLBACK11-NEXT: movq -64(%rsp,%r8), %r11
-; FALLBACK11-NEXT: movq -56(%rsp,%r8), %rbx
-; FALLBACK11-NEXT: shldq %cl, %rbx, %r10
-; FALLBACK11-NEXT: movq -16(%rsp,%r8), %r14
-; FALLBACK11-NEXT: movq %r14, %r15
-; FALLBACK11-NEXT: shldq %cl, %r9, %r15
-; FALLBACK11-NEXT: movq -8(%rsp,%r8), %r8
-; FALLBACK11-NEXT: shldq %cl, %r14, %r8
-; FALLBACK11-NEXT: shlxq %rcx, %r11, %r9
-; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK11-NEXT: shldq %cl, %r11, %rbx
-; FALLBACK11-NEXT: movq %r8, 56(%rdx)
-; FALLBACK11-NEXT: movq %r15, 48(%rdx)
-; FALLBACK11-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK11-NEXT: movq %r10, 16(%rdx)
-; FALLBACK11-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK11-NEXT: movq %rax, 32(%rdx)
-; FALLBACK11-NEXT: movq %rsi, 40(%rdx)
-; FALLBACK11-NEXT: movq %r9, (%rdx)
-; FALLBACK11-NEXT: popq %rbx
-; FALLBACK11-NEXT: popq %r14
-; FALLBACK11-NEXT: popq %r15
-; FALLBACK11-NEXT: vzeroupper
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: shl_64bytes:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: pushq %r15
-; FALLBACK12-NEXT: pushq %r14
-; FALLBACK12-NEXT: pushq %r13
-; FALLBACK12-NEXT: pushq %r12
-; FALLBACK12-NEXT: pushq %rbx
-; FALLBACK12-NEXT: vmovups (%rdi), %zmm0
-; FALLBACK12-NEXT: movl (%rsi), %ecx
-; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: leal (,%rcx,8), %eax
-; FALLBACK12-NEXT: andl $56, %eax
-; FALLBACK12-NEXT: andl $56, %ecx
-; FALLBACK12-NEXT: negl %ecx
-; FALLBACK12-NEXT: movslq %ecx, %r9
-; FALLBACK12-NEXT: movq -24(%rsp,%r9), %rdi
-; FALLBACK12-NEXT: movq %rdi, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: movq -32(%rsp,%r9), %r11
-; FALLBACK12-NEXT: movq %r11, %r8
-; FALLBACK12-NEXT: shrq %r8
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r8
-; FALLBACK12-NEXT: orq %r10, %r8
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r11
-; FALLBACK12-NEXT: movq -40(%rsp,%r9), %rbx
-; FALLBACK12-NEXT: movq %rbx, %r10
-; FALLBACK12-NEXT: shrq %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: orq %r11, %r10
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rbx
-; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r15
-; FALLBACK12-NEXT: movq %r15, %r11
-; FALLBACK12-NEXT: shrq %r11
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r11
-; FALLBACK12-NEXT: orq %rbx, %r11
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r15
-; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r14
-; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r12
-; FALLBACK12-NEXT: movq %r12, %rbx
-; FALLBACK12-NEXT: shrq %rbx
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %rbx
-; FALLBACK12-NEXT: orq %r15, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r12
-; FALLBACK12-NEXT: movq %r14, %r15
-; FALLBACK12-NEXT: shrq %r15
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r15
-; FALLBACK12-NEXT: orq %r12, %r15
-; FALLBACK12-NEXT: movq -16(%rsp,%r9), %r12
-; FALLBACK12-NEXT: movq %r12, %r13
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r13
-; FALLBACK12-NEXT: shrq %rdi
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %rdi
-; FALLBACK12-NEXT: orq %r13, %rdi
-; FALLBACK12-NEXT: movq -8(%rsp,%r9), %r9
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r9
-; FALLBACK12-NEXT: shrq %r12
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r12
-; FALLBACK12-NEXT: orq %r9, %r12
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r14
-; FALLBACK12-NEXT: movq %r14, (%rdx)
-; FALLBACK12-NEXT: movq %r12, 56(%rdx)
-; FALLBACK12-NEXT: movq %rdi, 48(%rdx)
-; FALLBACK12-NEXT: movq %r15, 8(%rdx)
-; FALLBACK12-NEXT: movq %rbx, 16(%rdx)
-; FALLBACK12-NEXT: movq %r11, 24(%rdx)
-; FALLBACK12-NEXT: movq %r10, 32(%rdx)
-; FALLBACK12-NEXT: movq %r8, 40(%rdx)
-; FALLBACK12-NEXT: popq %rbx
-; FALLBACK12-NEXT: popq %r12
-; FALLBACK12-NEXT: popq %r13
-; FALLBACK12-NEXT: popq %r14
-; FALLBACK12-NEXT: popq %r15
-; FALLBACK12-NEXT: vzeroupper
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: shl_64bytes:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: pushq %r15
-; FALLBACK13-NEXT: pushq %r14
-; FALLBACK13-NEXT: pushq %rbx
-; FALLBACK13-NEXT: vmovups (%rdi), %zmm0
-; FALLBACK13-NEXT: movl (%rsi), %eax
-; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: leal (,%rax,8), %ecx
-; FALLBACK13-NEXT: andl $56, %ecx
-; FALLBACK13-NEXT: andl $56, %eax
-; FALLBACK13-NEXT: negl %eax
-; FALLBACK13-NEXT: movslq %eax, %r8
-; FALLBACK13-NEXT: movq -32(%rsp,%r8), %rax
-; FALLBACK13-NEXT: movq -24(%rsp,%r8), %r9
-; FALLBACK13-NEXT: movq %r9, %rsi
-; FALLBACK13-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK13-NEXT: movq -40(%rsp,%r8), %rdi
-; FALLBACK13-NEXT: shldq %cl, %rdi, %rax
-; FALLBACK13-NEXT: movq -48(%rsp,%r8), %r10
-; FALLBACK13-NEXT: shldq %cl, %r10, %rdi
-; FALLBACK13-NEXT: movq -64(%rsp,%r8), %r11
-; FALLBACK13-NEXT: movq -56(%rsp,%r8), %rbx
-; FALLBACK13-NEXT: shldq %cl, %rbx, %r10
-; FALLBACK13-NEXT: movq -16(%rsp,%r8), %r14
-; FALLBACK13-NEXT: movq %r14, %r15
-; FALLBACK13-NEXT: shldq %cl, %r9, %r15
-; FALLBACK13-NEXT: movq -8(%rsp,%r8), %r8
-; FALLBACK13-NEXT: shldq %cl, %r14, %r8
-; FALLBACK13-NEXT: movq %r11, %r9
-; FALLBACK13-NEXT: shlq %cl, %r9
-; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK13-NEXT: shldq %cl, %r11, %rbx
-; FALLBACK13-NEXT: movq %r8, 56(%rdx)
-; FALLBACK13-NEXT: movq %r15, 48(%rdx)
-; FALLBACK13-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK13-NEXT: movq %r10, 16(%rdx)
-; FALLBACK13-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK13-NEXT: movq %rax, 32(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 40(%rdx)
-; FALLBACK13-NEXT: movq %r9, (%rdx)
-; FALLBACK13-NEXT: popq %rbx
-; FALLBACK13-NEXT: popq %r14
-; FALLBACK13-NEXT: popq %r15
-; FALLBACK13-NEXT: vzeroupper
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: shl_64bytes:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: pushq %rbp
-; FALLBACK14-NEXT: pushq %r15
-; FALLBACK14-NEXT: pushq %r14
-; FALLBACK14-NEXT: pushq %r13
-; FALLBACK14-NEXT: pushq %r12
-; FALLBACK14-NEXT: pushq %rbx
-; FALLBACK14-NEXT: subq $24, %rsp
-; FALLBACK14-NEXT: vmovups (%rdi), %zmm0
-; FALLBACK14-NEXT: movl (%rsi), %eax
-; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: leal (,%rax,8), %ecx
-; FALLBACK14-NEXT: andl $56, %ecx
-; FALLBACK14-NEXT: andl $56, %eax
-; FALLBACK14-NEXT: negl %eax
-; FALLBACK14-NEXT: movslq %eax, %rsi
-; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax
-; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12
-; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi
-; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15
-; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r13
-; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8
-; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r11
-; FALLBACK14-NEXT: shlxq %rcx, %r11, %r10
-; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14
-; FALLBACK14-NEXT: shlxq %rcx, %r14, %rbx
-; FALLBACK14-NEXT: movl %ecx, %r9d
-; FALLBACK14-NEXT: notb %r9b
-; FALLBACK14-NEXT: shrq %rdi
-; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi
-; FALLBACK14-NEXT: orq %r12, %rdi
-; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp
-; FALLBACK14-NEXT: shlxq %rcx, %rbp, %r8
-; FALLBACK14-NEXT: shrq %r13
-; FALLBACK14-NEXT: shrxq %r9, %r13, %r12
-; FALLBACK14-NEXT: orq %r15, %r12
-; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15
-; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi
-; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx
-; FALLBACK14-NEXT: shrq %r11
-; FALLBACK14-NEXT: shrxq %r9, %r11, %r11
-; FALLBACK14-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; FALLBACK14-NEXT: shrq %r14
-; FALLBACK14-NEXT: shrxq %r9, %r14, %r14
-; FALLBACK14-NEXT: orq %r10, %r14
-; FALLBACK14-NEXT: shrq %rsi
-; FALLBACK14-NEXT: shrxq %r9, %rsi, %rsi
-; FALLBACK14-NEXT: orq %rbx, %rsi
-; FALLBACK14-NEXT: shrq %rax
-; FALLBACK14-NEXT: shrxq %r9, %rax, %rax
-; FALLBACK14-NEXT: orq %r8, %rax
-; FALLBACK14-NEXT: shrq %rbp
-; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8
-; FALLBACK14-NEXT: orq %r15, %r8
-; FALLBACK14-NEXT: movq %rcx, (%rdx)
-; FALLBACK14-NEXT: movq %r8, 56(%rdx)
-; FALLBACK14-NEXT: movq %rax, 48(%rdx)
-; FALLBACK14-NEXT: movq %rsi, 8(%rdx)
-; FALLBACK14-NEXT: movq %r14, 16(%rdx)
-; FALLBACK14-NEXT: movq %r11, 24(%rdx)
-; FALLBACK14-NEXT: movq %r12, 32(%rdx)
-; FALLBACK14-NEXT: movq %rdi, 40(%rdx)
-; FALLBACK14-NEXT: addq $24, %rsp
-; FALLBACK14-NEXT: popq %rbx
-; FALLBACK14-NEXT: popq %r12
-; FALLBACK14-NEXT: popq %r13
-; FALLBACK14-NEXT: popq %r14
-; FALLBACK14-NEXT: popq %r15
-; FALLBACK14-NEXT: popq %rbp
-; FALLBACK14-NEXT: vzeroupper
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: shl_64bytes:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: pushq %r15
-; FALLBACK15-NEXT: pushq %r14
-; FALLBACK15-NEXT: pushq %rbx
-; FALLBACK15-NEXT: vmovups (%rdi), %zmm0
-; FALLBACK15-NEXT: movl (%rsi), %eax
-; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: leal (,%rax,8), %ecx
-; FALLBACK15-NEXT: andl $56, %ecx
-; FALLBACK15-NEXT: andl $56, %eax
-; FALLBACK15-NEXT: negl %eax
-; FALLBACK15-NEXT: movslq %eax, %r8
-; FALLBACK15-NEXT: movq -32(%rsp,%r8), %rax
-; FALLBACK15-NEXT: movq -24(%rsp,%r8), %r9
-; FALLBACK15-NEXT: movq %r9, %rsi
-; FALLBACK15-NEXT: shldq %cl, %rax, %rsi
-; FALLBACK15-NEXT: movq -40(%rsp,%r8), %rdi
-; FALLBACK15-NEXT: shldq %cl, %rdi, %rax
-; FALLBACK15-NEXT: movq -48(%rsp,%r8), %r10
-; FALLBACK15-NEXT: shldq %cl, %r10, %rdi
-; FALLBACK15-NEXT: movq -64(%rsp,%r8), %r11
-; FALLBACK15-NEXT: movq -56(%rsp,%r8), %rbx
-; FALLBACK15-NEXT: shldq %cl, %rbx, %r10
-; FALLBACK15-NEXT: movq -16(%rsp,%r8), %r14
-; FALLBACK15-NEXT: movq %r14, %r15
-; FALLBACK15-NEXT: shldq %cl, %r9, %r15
-; FALLBACK15-NEXT: movq -8(%rsp,%r8), %r8
-; FALLBACK15-NEXT: shldq %cl, %r14, %r8
-; FALLBACK15-NEXT: shlxq %rcx, %r11, %r9
-; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK15-NEXT: shldq %cl, %r11, %rbx
-; FALLBACK15-NEXT: movq %r8, 56(%rdx)
-; FALLBACK15-NEXT: movq %r15, 48(%rdx)
-; FALLBACK15-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK15-NEXT: movq %r10, 16(%rdx)
-; FALLBACK15-NEXT: movq %rdi, 24(%rdx)
-; FALLBACK15-NEXT: movq %rax, 32(%rdx)
-; FALLBACK15-NEXT: movq %rsi, 40(%rdx)
-; FALLBACK15-NEXT: movq %r9, (%rdx)
-; FALLBACK15-NEXT: popq %rbx
-; FALLBACK15-NEXT: popq %r14
-; FALLBACK15-NEXT: popq %r15
-; FALLBACK15-NEXT: vzeroupper
-; FALLBACK15-NEXT: retq
-;
-; FALLBACK16-LABEL: shl_64bytes:
-; FALLBACK16: # %bb.0:
-; FALLBACK16-NEXT: pushl %ebp
-; FALLBACK16-NEXT: pushl %ebx
-; FALLBACK16-NEXT: pushl %edi
-; FALLBACK16-NEXT: pushl %esi
-; FALLBACK16-NEXT: subl $204, %esp
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl (%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 4(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 8(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 12(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 16(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 20(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 24(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 28(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 32(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 36(%eax), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 40(%eax), %ebp
-; FALLBACK16-NEXT: movl 44(%eax), %ebx
-; FALLBACK16-NEXT: movl 48(%eax), %edi
-; FALLBACK16-NEXT: movl 52(%eax), %esi
-; FALLBACK16-NEXT: movl 56(%eax), %edx
-; FALLBACK16-NEXT: movl 60(%eax), %ecx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl (%eax), %eax
-; FALLBACK16-NEXT: xorps %xmm0, %xmm0
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %eax, %edx
-; FALLBACK16-NEXT: andl $60, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT: subl %edx, %ecx
-; FALLBACK16-NEXT: movl (%ecx), %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 4(%ecx), %edx
-; FALLBACK16-NEXT: movl %ecx, %ebp
-; FALLBACK16-NEXT: shll $3, %eax
-; FALLBACK16-NEXT: andl $24, %eax
-; FALLBACK16-NEXT: movl %edx, %esi
-; FALLBACK16-NEXT: movl %eax, %ecx
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %al, %ch
-; FALLBACK16-NEXT: notb %ch
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %esi, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 12(%ebp), %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl 8(%ebp), %esi
-; FALLBACK16-NEXT: movl %ebp, %edi
-; FALLBACK16-NEXT: movl %esi, %ebp
-; FALLBACK16-NEXT: shrl %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: orl %ebx, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %esi, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %edi, %ebp
-; FALLBACK16-NEXT: movl 20(%edi), %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl 16(%edi), %esi
-; FALLBACK16-NEXT: movl %esi, %edx
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %ebx, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %esi, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %ebp, %edx
-; FALLBACK16-NEXT: movl 28(%ebp), %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl 24(%ebp), %esi
-; FALLBACK16-NEXT: movl %esi, %edi
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %ebx, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK16-NEXT: shrl %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: orl %esi, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 36(%edx), %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl 32(%edx), %esi
-; FALLBACK16-NEXT: movl %edx, %ebp
-; FALLBACK16-NEXT: movl %esi, %edi
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %ebx, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %esi, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 44(%ebp), %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl 40(%ebp), %esi
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %esi, %edx
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %ebx, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %esi, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 52(%ebp), %esi
-; FALLBACK16-NEXT: movl %esi, %edi
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: negl %edx
-; FALLBACK16-NEXT: movl 176(%esp,%edx), %ebx
-; FALLBACK16-NEXT: movl %ebx, %ebp
-; FALLBACK16-NEXT: shrl %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %ebp
-; FALLBACK16-NEXT: orl %edi, %ebp
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: shrl %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: orl %ebx, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK16-NEXT: movl 60(%edi), %edx
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: movl 56(%edi), %ebx
-; FALLBACK16-NEXT: movl %ebx, %edi
-; FALLBACK16-NEXT: shrl %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: orl %edx, %edi
-; FALLBACK16-NEXT: movb %al, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: shrl %esi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: orl %ebx, %esi
-; FALLBACK16-NEXT: movl %eax, %ecx
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl %edx, (%eax)
-; FALLBACK16-NEXT: movl %esi, 56(%eax)
-; FALLBACK16-NEXT: movl %edi, 60(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 48(%eax)
-; FALLBACK16-NEXT: movl %ebp, 52(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 40(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 44(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 32(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 36(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 24(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 28(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 16(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 20(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 8(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 12(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 4(%eax)
-; FALLBACK16-NEXT: addl $204, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: shl_64bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebp
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $188, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl (%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 8(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 12(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 16(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 20(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 24(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 28(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 32(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 36(%ecx), %eax
-; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 40(%ecx), %ebp
-; FALLBACK17-NEXT: movl 44(%ecx), %ebx
-; FALLBACK17-NEXT: movl 48(%ecx), %edi
-; FALLBACK17-NEXT: movl 52(%ecx), %esi
-; FALLBACK17-NEXT: movl 56(%ecx), %edx
-; FALLBACK17-NEXT: movl 60(%ecx), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl (%ecx), %ecx
-; FALLBACK17-NEXT: xorps %xmm0, %xmm0
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ecx, %ebp
-; FALLBACK17-NEXT: andl $60, %ebp
-; FALLBACK17-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: subl %ebp, %eax
-; FALLBACK17-NEXT: movl 8(%eax), %esi
-; FALLBACK17-NEXT: movl 12(%eax), %edx
-; FALLBACK17-NEXT: shll $3, %ecx
-; FALLBACK17-NEXT: andl $24, %ecx
-; FALLBACK17-NEXT: movl %edx, %edi
-; FALLBACK17-NEXT: shldl %cl, %esi, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%eax), %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %edi, %esi
-; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 16(%eax), %edi
-; FALLBACK17-NEXT: movl 20(%eax), %esi
-; FALLBACK17-NEXT: movl %esi, %ebx
-; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %edx, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 24(%eax), %edi
-; FALLBACK17-NEXT: movl 28(%eax), %edx
-; FALLBACK17-NEXT: movl %edx, %ebx
-; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %esi, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 32(%eax), %edi
-; FALLBACK17-NEXT: movl 36(%eax), %esi
-; FALLBACK17-NEXT: movl %esi, %ebx
-; FALLBACK17-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %edx, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 40(%eax), %edx
-; FALLBACK17-NEXT: movl 44(%eax), %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %edx, %edi
-; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: shldl %cl, %esi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 56(%eax), %edx
-; FALLBACK17-NEXT: movl 60(%eax), %edi
-; FALLBACK17-NEXT: shldl %cl, %edx, %edi
-; FALLBACK17-NEXT: movl (%eax), %ebx
-; FALLBACK17-NEXT: movl 52(%eax), %esi
-; FALLBACK17-NEXT: shldl %cl, %esi, %edx
-; FALLBACK17-NEXT: negl %ebp
-; FALLBACK17-NEXT: movl 160(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK17-NEXT: movl %edx, 56(%ebp)
-; FALLBACK17-NEXT: movl %edi, 60(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK17-NEXT: shll %cl, %ebx
-; FALLBACK17-NEXT: shldl %cl, %eax, %esi
-; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK17-NEXT: shldl %cl, %edi, %eax
-; FALLBACK17-NEXT: movl %eax, 48(%ebp)
-; FALLBACK17-NEXT: movl %esi, 52(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 40(%ebp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 44(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 32(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 36(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 24(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 28(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 16(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 20(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 8(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 12(%ebp)
-; FALLBACK17-NEXT: movl %ebx, (%ebp)
-; FALLBACK17-NEXT: movl %edx, 4(%ebp)
-; FALLBACK17-NEXT: addl $188, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: popl %ebp
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: shl_64bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $204, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl (%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 12(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 16(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 20(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 24(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 28(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 32(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 36(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 40(%eax), %ebx
-; FALLBACK18-NEXT: movl 44(%eax), %edi
-; FALLBACK18-NEXT: movl 48(%eax), %esi
-; FALLBACK18-NEXT: movl 52(%eax), %edx
-; FALLBACK18-NEXT: movl 56(%eax), %ecx
-; FALLBACK18-NEXT: movl 60(%eax), %eax
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK18-NEXT: movl (%ebp), %ebp
-; FALLBACK18-NEXT: xorps %xmm0, %xmm0
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: leal (,%ebp,8), %edx
-; FALLBACK18-NEXT: andl $24, %edx
-; FALLBACK18-NEXT: andl $60, %ebp
-; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi
-; FALLBACK18-NEXT: subl %ebp, %edi
-; FALLBACK18-NEXT: movl (%edi), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%edi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl %edx, %ebx
-; FALLBACK18-NEXT: notb %bl
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi
-; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx
-; FALLBACK18-NEXT: orl %ecx, %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%edi), %esi
-; FALLBACK18-NEXT: movl %esi, %ecx
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK18-NEXT: movl 12(%edi), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: shrl %eax
-; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT: orl %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 16(%edi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %eax
-; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT: movl 20(%edi), %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT: orl %eax, %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 24(%edi), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK18-NEXT: movl 28(%edi), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %esi
-; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK18-NEXT: orl %eax, %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 32(%edi), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %eax
-; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT: movl 36(%edi), %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT: orl %eax, %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 40(%edi), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK18-NEXT: movl 44(%edi), %ecx
-; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %esi
-; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK18-NEXT: orl %eax, %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 48(%edi), %esi
-; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrl %esi
-; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax
-; FALLBACK18-NEXT: movl 52(%edi), %esi
-; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrl %ecx
-; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ebp
-; FALLBACK18-NEXT: orl %eax, %ebp
-; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK18-NEXT: negl %eax
-; FALLBACK18-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK18-NEXT: movl 56(%edi), %eax
-; FALLBACK18-NEXT: shlxl %edx, %eax, %edx
-; FALLBACK18-NEXT: shrl %esi
-; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK18-NEXT: orl %edx, %esi
-; FALLBACK18-NEXT: shrl %eax
-; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK18-NEXT: orl %eax, %ecx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, (%eax)
-; FALLBACK18-NEXT: movl %esi, 56(%eax)
-; FALLBACK18-NEXT: movl %ecx, 60(%eax)
-; FALLBACK18-NEXT: movl %ebp, 48(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 52(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 40(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 44(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 32(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 36(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 24(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 28(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 16(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 20(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 8(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 12(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 4(%eax)
-; FALLBACK18-NEXT: addl $204, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: shl_64bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $204, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl (%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 4(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 12(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 16(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 20(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 24(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 28(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 32(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 36(%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 40(%ebp), %ebx
-; FALLBACK19-NEXT: movl 44(%ebp), %edi
-; FALLBACK19-NEXT: movl 48(%ebp), %esi
-; FALLBACK19-NEXT: movl 52(%ebp), %edx
-; FALLBACK19-NEXT: movl 56(%ebp), %ecx
-; FALLBACK19-NEXT: movl 60(%ebp), %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl (%ebp), %ebp
-; FALLBACK19-NEXT: xorps %xmm0, %xmm0
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: leal (,%ebp,8), %ecx
-; FALLBACK19-NEXT: andl $24, %ecx
-; FALLBACK19-NEXT: andl $60, %ebp
-; FALLBACK19-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK19-NEXT: subl %ebp, %eax
-; FALLBACK19-NEXT: movl 4(%eax), %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%eax), %edi
-; FALLBACK19-NEXT: movl 12(%eax), %edx
-; FALLBACK19-NEXT: movl %edx, %ebx
-; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %esi, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 16(%eax), %edi
-; FALLBACK19-NEXT: movl 20(%eax), %esi
-; FALLBACK19-NEXT: movl %esi, %ebx
-; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 24(%eax), %edi
-; FALLBACK19-NEXT: movl 28(%eax), %edx
-; FALLBACK19-NEXT: movl %edx, %ebx
-; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %esi, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 32(%eax), %edi
-; FALLBACK19-NEXT: movl 36(%eax), %esi
-; FALLBACK19-NEXT: movl %esi, %ebx
-; FALLBACK19-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 40(%eax), %ebx
-; FALLBACK19-NEXT: movl 44(%eax), %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shldl %cl, %esi, %ebx
-; FALLBACK19-NEXT: movl 56(%eax), %edx
-; FALLBACK19-NEXT: movl 60(%eax), %edi
-; FALLBACK19-NEXT: shldl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl (%eax), %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 52(%eax), %esi
-; FALLBACK19-NEXT: shldl %cl, %esi, %edx
-; FALLBACK19-NEXT: negl %ebp
-; FALLBACK19-NEXT: movl 176(%esp,%ebp), %ebp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK19-NEXT: movl %edx, 56(%eax)
-; FALLBACK19-NEXT: movl %edi, 60(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: shlxl %ecx, %edx, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK19-NEXT: shldl %cl, %edx, %edi
-; FALLBACK19-NEXT: shldl %cl, %ebp, %esi
-; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: shldl %cl, %edx, %ebp
-; FALLBACK19-NEXT: movl %ebp, 48(%eax)
-; FALLBACK19-NEXT: movl %esi, 52(%eax)
-; FALLBACK19-NEXT: movl %ebx, 40(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 44(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 32(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 36(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 24(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 28(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 16(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 20(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 8(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 12(%eax)
-; FALLBACK19-NEXT: movl %edi, 4(%eax)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, (%eax)
-; FALLBACK19-NEXT: addl $204, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: shl_64bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $204, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK20-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK20-NEXT: movl (%eax), %eax
-; FALLBACK20-NEXT: xorps %xmm4, %xmm4
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %eax, %edx
-; FALLBACK20-NEXT: andl $60, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: subl %edx, %ecx
-; FALLBACK20-NEXT: movl (%ecx), %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 4(%ecx), %edx
-; FALLBACK20-NEXT: movl %ecx, %ebp
-; FALLBACK20-NEXT: shll $3, %eax
-; FALLBACK20-NEXT: andl $24, %eax
-; FALLBACK20-NEXT: movl %edx, %esi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movb %al, %ch
-; FALLBACK20-NEXT: notb %ch
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %esi, %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 12(%ebp), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl 8(%ebp), %esi
-; FALLBACK20-NEXT: movl %ebp, %edi
-; FALLBACK20-NEXT: movl %esi, %ebp
-; FALLBACK20-NEXT: shrl %ebp
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: orl %ebx, %ebp
-; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %esi, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl %edi, %ebp
-; FALLBACK20-NEXT: movl 20(%edi), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl 16(%edi), %esi
-; FALLBACK20-NEXT: movl %esi, %edx
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %ebx, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %esi, %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl %ebp, %edx
-; FALLBACK20-NEXT: movl 28(%ebp), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl 24(%ebp), %esi
-; FALLBACK20-NEXT: movl %esi, %edi
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %ebx, %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK20-NEXT: shrl %ebp
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: orl %esi, %ebp
-; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 36(%edx), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl 32(%edx), %esi
-; FALLBACK20-NEXT: movl %edx, %ebp
-; FALLBACK20-NEXT: movl %esi, %edi
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %ebx, %edi
-; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %esi, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 44(%ebp), %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl 40(%ebp), %esi
-; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl %esi, %edx
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %ebx, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %esi, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 52(%ebp), %esi
-; FALLBACK20-NEXT: movl %esi, %edi
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: negl %edx
-; FALLBACK20-NEXT: movl 176(%esp,%edx), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: shrl %ebp
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: orl %edi, %ebp
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: orl %ebx, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK20-NEXT: movl 60(%edi), %edx
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: movl 56(%edi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %edi
-; FALLBACK20-NEXT: shrl %edi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: orl %edx, %edi
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: shrl %esi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shrl %cl, %esi
-; FALLBACK20-NEXT: orl %ebx, %esi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl %edx, (%eax)
-; FALLBACK20-NEXT: movl %esi, 56(%eax)
-; FALLBACK20-NEXT: movl %edi, 60(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 48(%eax)
-; FALLBACK20-NEXT: movl %ebp, 52(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 40(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 44(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 32(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 36(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 24(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 28(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 16(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 20(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 8(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 12(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 4(%eax)
-; FALLBACK20-NEXT: addl $204, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: shl_64bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $188, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movups (%ecx), %xmm0
-; FALLBACK21-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK21-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK21-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK21-NEXT: movl (%eax), %ecx
-; FALLBACK21-NEXT: xorps %xmm4, %xmm4
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %ecx, %ebp
-; FALLBACK21-NEXT: andl $60, %ebp
-; FALLBACK21-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: subl %ebp, %eax
-; FALLBACK21-NEXT: movl 8(%eax), %esi
-; FALLBACK21-NEXT: movl 12(%eax), %edx
-; FALLBACK21-NEXT: shll $3, %ecx
-; FALLBACK21-NEXT: andl $24, %ecx
-; FALLBACK21-NEXT: movl %edx, %edi
-; FALLBACK21-NEXT: shldl %cl, %esi, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 4(%eax), %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %edi, %esi
-; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 16(%eax), %edi
-; FALLBACK21-NEXT: movl 20(%eax), %esi
-; FALLBACK21-NEXT: movl %esi, %ebx
-; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %edx, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 24(%eax), %edi
-; FALLBACK21-NEXT: movl 28(%eax), %edx
-; FALLBACK21-NEXT: movl %edx, %ebx
-; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %esi, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 32(%eax), %edi
-; FALLBACK21-NEXT: movl 36(%eax), %esi
-; FALLBACK21-NEXT: movl %esi, %ebx
-; FALLBACK21-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %edx, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 40(%eax), %edx
-; FALLBACK21-NEXT: movl 44(%eax), %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %edx, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shldl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK21-NEXT: movl 56(%eax), %edx
-; FALLBACK21-NEXT: movl 60(%eax), %edi
-; FALLBACK21-NEXT: shldl %cl, %edx, %edi
-; FALLBACK21-NEXT: movl (%eax), %ebx
-; FALLBACK21-NEXT: movl 52(%eax), %esi
-; FALLBACK21-NEXT: shldl %cl, %esi, %edx
-; FALLBACK21-NEXT: negl %ebp
-; FALLBACK21-NEXT: movl 160(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK21-NEXT: movl %edx, 56(%ebp)
-; FALLBACK21-NEXT: movl %edi, 60(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK21-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK21-NEXT: shll %cl, %ebx
-; FALLBACK21-NEXT: shldl %cl, %eax, %esi
-; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK21-NEXT: shldl %cl, %edi, %eax
-; FALLBACK21-NEXT: movl %eax, 48(%ebp)
-; FALLBACK21-NEXT: movl %esi, 52(%ebp)
-; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 40(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 44(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 32(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 36(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 24(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 28(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 16(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 20(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 8(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 12(%ebp)
-; FALLBACK21-NEXT: movl %ebx, (%ebp)
-; FALLBACK21-NEXT: movl %edx, 4(%ebp)
-; FALLBACK21-NEXT: addl $188, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: shl_64bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $204, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK22-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK22-NEXT: movl (%eax), %eax
-; FALLBACK22-NEXT: xorps %xmm4, %xmm4
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: leal (,%eax,8), %edx
-; FALLBACK22-NEXT: andl $24, %edx
-; FALLBACK22-NEXT: andl $60, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi
-; FALLBACK22-NEXT: subl %eax, %edi
-; FALLBACK22-NEXT: movl (%edi), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 4(%edi), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl %edx, %ebx
-; FALLBACK22-NEXT: notb %bl
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi
-; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx
-; FALLBACK22-NEXT: orl %ecx, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 8(%edi), %esi
-; FALLBACK22-NEXT: movl %esi, %ecx
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT: movl 12(%edi), %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: shrl %eax
-; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT: orl %esi, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 16(%edi), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrl %eax
-; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT: movl 20(%edi), %esi
-; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT: orl %eax, %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 24(%edi), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT: movl 28(%edi), %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrl %esi
-; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %eax, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 32(%edi), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrl %eax
-; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT: movl 36(%edi), %esi
-; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT: orl %eax, %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 40(%edi), %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK22-NEXT: movl 44(%edi), %ecx
-; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrl %esi
-; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %eax, %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 48(%edi), %esi
-; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrl %esi
-; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax
-; FALLBACK22-NEXT: movl 52(%edi), %esi
-; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrl %ecx
-; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebp
-; FALLBACK22-NEXT: orl %eax, %ebp
-; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK22-NEXT: negl %eax
-; FALLBACK22-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK22-NEXT: movl 56(%edi), %eax
-; FALLBACK22-NEXT: shlxl %edx, %eax, %edx
-; FALLBACK22-NEXT: shrl %esi
-; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %edx, %esi
-; FALLBACK22-NEXT: shrl %eax
-; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK22-NEXT: orl %eax, %ecx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK22-NEXT: movl %edx, (%eax)
-; FALLBACK22-NEXT: movl %esi, 56(%eax)
-; FALLBACK22-NEXT: movl %ecx, 60(%eax)
-; FALLBACK22-NEXT: movl %ebp, 48(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 52(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 40(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 44(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 32(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 36(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 24(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 28(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 16(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 20(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 8(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 12(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 4(%eax)
-; FALLBACK22-NEXT: addl $204, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: shl_64bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $204, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movups (%ecx), %xmm0
-; FALLBACK23-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK23-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK23-NEXT: movups 48(%ecx), %xmm3
-; FALLBACK23-NEXT: movl (%eax), %ebp
-; FALLBACK23-NEXT: xorps %xmm4, %xmm4
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: leal (,%ebp,8), %ecx
-; FALLBACK23-NEXT: andl $24, %ecx
-; FALLBACK23-NEXT: andl $60, %ebp
-; FALLBACK23-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: subl %ebp, %eax
-; FALLBACK23-NEXT: movl 4(%eax), %esi
-; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 8(%eax), %edi
-; FALLBACK23-NEXT: movl 12(%eax), %edx
-; FALLBACK23-NEXT: movl %edx, %ebx
-; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %esi, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 16(%eax), %edi
-; FALLBACK23-NEXT: movl 20(%eax), %esi
-; FALLBACK23-NEXT: movl %esi, %ebx
-; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %edx, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 24(%eax), %edi
-; FALLBACK23-NEXT: movl 28(%eax), %edx
-; FALLBACK23-NEXT: movl %edx, %ebx
-; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %esi, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 32(%eax), %edi
-; FALLBACK23-NEXT: movl 36(%eax), %esi
-; FALLBACK23-NEXT: movl %esi, %ebx
-; FALLBACK23-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %edx, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 40(%eax), %ebx
-; FALLBACK23-NEXT: movl 44(%eax), %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shldl %cl, %esi, %ebx
-; FALLBACK23-NEXT: movl 56(%eax), %edx
-; FALLBACK23-NEXT: movl 60(%eax), %edi
-; FALLBACK23-NEXT: shldl %cl, %edx, %edi
-; FALLBACK23-NEXT: movl (%eax), %esi
-; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 52(%eax), %esi
-; FALLBACK23-NEXT: shldl %cl, %esi, %edx
-; FALLBACK23-NEXT: negl %ebp
-; FALLBACK23-NEXT: movl 176(%esp,%ebp), %ebp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movl %edx, 56(%eax)
-; FALLBACK23-NEXT: movl %edi, 60(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK23-NEXT: shlxl %ecx, %edx, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK23-NEXT: shldl %cl, %edx, %edi
-; FALLBACK23-NEXT: shldl %cl, %ebp, %esi
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK23-NEXT: shldl %cl, %edx, %ebp
-; FALLBACK23-NEXT: movl %ebp, 48(%eax)
-; FALLBACK23-NEXT: movl %esi, 52(%eax)
-; FALLBACK23-NEXT: movl %ebx, 40(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 44(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 32(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 36(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 24(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 28(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 16(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 20(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 8(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 12(%eax)
-; FALLBACK23-NEXT: movl %edi, 4(%eax)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, (%eax)
-; FALLBACK23-NEXT: addl $204, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: shl_64bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $204, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK24-NEXT: movl (%eax), %eax
-; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %eax, %edx
-; FALLBACK24-NEXT: andl $60, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: subl %edx, %ecx
-; FALLBACK24-NEXT: movl (%ecx), %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 4(%ecx), %edx
-; FALLBACK24-NEXT: movl %ecx, %ebp
-; FALLBACK24-NEXT: shll $3, %eax
-; FALLBACK24-NEXT: andl $24, %eax
-; FALLBACK24-NEXT: movl %edx, %esi
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movb %al, %ch
-; FALLBACK24-NEXT: notb %ch
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %esi, %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 12(%ebp), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl 8(%ebp), %esi
-; FALLBACK24-NEXT: movl %ebp, %edi
-; FALLBACK24-NEXT: movl %esi, %ebp
-; FALLBACK24-NEXT: shrl %ebp
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: orl %ebx, %ebp
-; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %esi, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl %edi, %ebp
-; FALLBACK24-NEXT: movl 20(%edi), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl 16(%edi), %esi
-; FALLBACK24-NEXT: movl %esi, %edx
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %ebx, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %esi, %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl %ebp, %edx
-; FALLBACK24-NEXT: movl 28(%ebp), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl 24(%ebp), %esi
-; FALLBACK24-NEXT: movl %esi, %edi
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %ebx, %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK24-NEXT: shrl %ebp
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: orl %esi, %ebp
-; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 36(%edx), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl 32(%edx), %esi
-; FALLBACK24-NEXT: movl %edx, %ebp
-; FALLBACK24-NEXT: movl %esi, %edi
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %ebx, %edi
-; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %esi, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 44(%ebp), %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl 40(%ebp), %esi
-; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl %esi, %edx
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %ebx, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %esi, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 52(%ebp), %esi
-; FALLBACK24-NEXT: movl %esi, %edi
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: negl %edx
-; FALLBACK24-NEXT: movl 176(%esp,%edx), %ebx
-; FALLBACK24-NEXT: movl %ebx, %ebp
-; FALLBACK24-NEXT: shrl %ebp
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: orl %edi, %ebp
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: orl %ebx, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK24-NEXT: movl 60(%edi), %edx
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: movl 56(%edi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %edi
-; FALLBACK24-NEXT: shrl %edi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: orl %edx, %edi
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: shrl %esi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shrl %cl, %esi
-; FALLBACK24-NEXT: orl %ebx, %esi
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl %edx, (%eax)
-; FALLBACK24-NEXT: movl %esi, 56(%eax)
-; FALLBACK24-NEXT: movl %edi, 60(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 48(%eax)
-; FALLBACK24-NEXT: movl %ebp, 52(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 40(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 44(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 32(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 36(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 24(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 28(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 16(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 20(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 8(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 12(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 4(%eax)
-; FALLBACK24-NEXT: addl $204, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: vzeroupper
-; FALLBACK24-NEXT: retl
-;
-; FALLBACK25-LABEL: shl_64bytes:
-; FALLBACK25: # %bb.0:
-; FALLBACK25-NEXT: pushl %ebp
-; FALLBACK25-NEXT: pushl %ebx
-; FALLBACK25-NEXT: pushl %edi
-; FALLBACK25-NEXT: pushl %esi
-; FALLBACK25-NEXT: subl $188, %esp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK25-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK25-NEXT: movl (%eax), %ecx
-; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %ecx, %ebp
-; FALLBACK25-NEXT: andl $60, %ebp
-; FALLBACK25-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: subl %ebp, %eax
-; FALLBACK25-NEXT: movl 8(%eax), %esi
-; FALLBACK25-NEXT: movl 12(%eax), %edx
-; FALLBACK25-NEXT: shll $3, %ecx
-; FALLBACK25-NEXT: andl $24, %ecx
-; FALLBACK25-NEXT: movl %edx, %edi
-; FALLBACK25-NEXT: shldl %cl, %esi, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 4(%eax), %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shldl %cl, %edi, %esi
-; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 16(%eax), %edi
-; FALLBACK25-NEXT: movl 20(%eax), %esi
-; FALLBACK25-NEXT: movl %esi, %ebx
-; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shldl %cl, %edx, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 24(%eax), %edi
-; FALLBACK25-NEXT: movl 28(%eax), %edx
-; FALLBACK25-NEXT: movl %edx, %ebx
-; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shldl %cl, %esi, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 32(%eax), %edi
-; FALLBACK25-NEXT: movl 36(%eax), %esi
-; FALLBACK25-NEXT: movl %esi, %ebx
-; FALLBACK25-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shldl %cl, %edx, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 40(%eax), %edx
-; FALLBACK25-NEXT: movl 44(%eax), %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shldl %cl, %edx, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shldl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK25-NEXT: movl 56(%eax), %edx
-; FALLBACK25-NEXT: movl 60(%eax), %edi
-; FALLBACK25-NEXT: shldl %cl, %edx, %edi
-; FALLBACK25-NEXT: movl (%eax), %ebx
-; FALLBACK25-NEXT: movl 52(%eax), %esi
-; FALLBACK25-NEXT: shldl %cl, %esi, %edx
-; FALLBACK25-NEXT: negl %ebp
-; FALLBACK25-NEXT: movl 160(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK25-NEXT: movl %edx, 56(%ebp)
-; FALLBACK25-NEXT: movl %edi, 60(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK25-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK25-NEXT: shll %cl, %ebx
-; FALLBACK25-NEXT: shldl %cl, %eax, %esi
-; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK25-NEXT: shldl %cl, %edi, %eax
-; FALLBACK25-NEXT: movl %eax, 48(%ebp)
-; FALLBACK25-NEXT: movl %esi, 52(%ebp)
-; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 40(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 44(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 32(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 36(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 24(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 28(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 16(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 20(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 8(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 12(%ebp)
-; FALLBACK25-NEXT: movl %ebx, (%ebp)
-; FALLBACK25-NEXT: movl %edx, 4(%ebp)
-; FALLBACK25-NEXT: addl $188, %esp
-; FALLBACK25-NEXT: popl %esi
-; FALLBACK25-NEXT: popl %edi
-; FALLBACK25-NEXT: popl %ebx
-; FALLBACK25-NEXT: popl %ebp
-; FALLBACK25-NEXT: vzeroupper
-; FALLBACK25-NEXT: retl
-;
-; FALLBACK26-LABEL: shl_64bytes:
-; FALLBACK26: # %bb.0:
-; FALLBACK26-NEXT: pushl %ebp
-; FALLBACK26-NEXT: pushl %ebx
-; FALLBACK26-NEXT: pushl %edi
-; FALLBACK26-NEXT: pushl %esi
-; FALLBACK26-NEXT: subl $204, %esp
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK26-NEXT: movl (%eax), %eax
-; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: leal (,%eax,8), %edx
-; FALLBACK26-NEXT: andl $24, %edx
-; FALLBACK26-NEXT: andl $60, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi
-; FALLBACK26-NEXT: subl %eax, %edi
-; FALLBACK26-NEXT: movl (%edi), %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 4(%edi), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl %edx, %ebx
-; FALLBACK26-NEXT: notb %bl
-; FALLBACK26-NEXT: shrl %ecx
-; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi
-; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx
-; FALLBACK26-NEXT: orl %ecx, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 8(%edi), %esi
-; FALLBACK26-NEXT: movl %esi, %ecx
-; FALLBACK26-NEXT: shrl %ecx
-; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK26-NEXT: movl 12(%edi), %ecx
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: shrl %eax
-; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT: orl %esi, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 16(%edi), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrl %eax
-; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT: movl 20(%edi), %esi
-; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: shrl %ecx
-; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT: orl %eax, %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 24(%edi), %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrl %ecx
-; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK26-NEXT: movl 28(%edi), %ecx
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: shrl %esi
-; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT: orl %eax, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 32(%edi), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrl %eax
-; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT: movl 36(%edi), %esi
-; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: shrl %ecx
-; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT: orl %eax, %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 40(%edi), %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrl %ecx
-; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK26-NEXT: movl 44(%edi), %ecx
-; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: shrl %esi
-; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT: orl %eax, %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 48(%edi), %esi
-; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrl %esi
-; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax
-; FALLBACK26-NEXT: movl 52(%edi), %esi
-; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: shrl %ecx
-; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebp
-; FALLBACK26-NEXT: orl %eax, %ebp
-; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK26-NEXT: negl %eax
-; FALLBACK26-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK26-NEXT: movl 56(%edi), %eax
-; FALLBACK26-NEXT: shlxl %edx, %eax, %edx
-; FALLBACK26-NEXT: shrl %esi
-; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK26-NEXT: orl %edx, %esi
-; FALLBACK26-NEXT: shrl %eax
-; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK26-NEXT: orl %eax, %ecx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK26-NEXT: movl %edx, (%eax)
-; FALLBACK26-NEXT: movl %esi, 56(%eax)
-; FALLBACK26-NEXT: movl %ecx, 60(%eax)
-; FALLBACK26-NEXT: movl %ebp, 48(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 52(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 40(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 44(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 32(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 36(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 24(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 28(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 16(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 20(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 8(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 12(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 4(%eax)
-; FALLBACK26-NEXT: addl $204, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: vzeroupper
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: shl_64bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $204, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1
-; FALLBACK27-NEXT: movl (%eax), %ebx
-; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: leal (,%ebx,8), %ecx
-; FALLBACK27-NEXT: andl $24, %ecx
-; FALLBACK27-NEXT: andl $60, %ebx
-; FALLBACK27-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: subl %ebx, %eax
-; FALLBACK27-NEXT: movl 4(%eax), %esi
-; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 8(%eax), %edi
-; FALLBACK27-NEXT: movl 12(%eax), %edx
-; FALLBACK27-NEXT: movl %edx, %ebp
-; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shldl %cl, %esi, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 16(%eax), %edi
-; FALLBACK27-NEXT: movl 20(%eax), %esi
-; FALLBACK27-NEXT: movl %esi, %ebp
-; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shldl %cl, %edx, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 24(%eax), %edi
-; FALLBACK27-NEXT: movl 28(%eax), %edx
-; FALLBACK27-NEXT: movl %edx, %ebp
-; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shldl %cl, %esi, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 32(%eax), %edi
-; FALLBACK27-NEXT: movl 36(%eax), %esi
-; FALLBACK27-NEXT: movl %esi, %ebp
-; FALLBACK27-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shldl %cl, %edx, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 40(%eax), %ebp
-; FALLBACK27-NEXT: movl 44(%eax), %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shldl %cl, %ebp, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shldl %cl, %esi, %ebp
-; FALLBACK27-NEXT: movl 56(%eax), %edx
-; FALLBACK27-NEXT: movl 60(%eax), %edi
-; FALLBACK27-NEXT: shldl %cl, %edx, %edi
-; FALLBACK27-NEXT: movl (%eax), %esi
-; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 52(%eax), %esi
-; FALLBACK27-NEXT: shldl %cl, %esi, %edx
-; FALLBACK27-NEXT: negl %ebx
-; FALLBACK27-NEXT: movl 176(%esp,%ebx), %ebx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: movl %edx, 56(%eax)
-; FALLBACK27-NEXT: movl %edi, 60(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK27-NEXT: shlxl %ecx, %edx, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK27-NEXT: shldl %cl, %edx, %edi
-; FALLBACK27-NEXT: shldl %cl, %ebx, %esi
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK27-NEXT: shldl %cl, %edx, %ebx
-; FALLBACK27-NEXT: movl %ebx, 48(%eax)
-; FALLBACK27-NEXT: movl %esi, 52(%eax)
-; FALLBACK27-NEXT: movl %ebp, 40(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 44(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 32(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 36(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 24(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 28(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 16(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 20(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 8(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 12(%eax)
-; FALLBACK27-NEXT: movl %edi, 4(%eax)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, (%eax)
-; FALLBACK27-NEXT: addl $204, %esp
-; FALLBACK27-NEXT: popl %esi
-; FALLBACK27-NEXT: popl %edi
-; FALLBACK27-NEXT: popl %ebx
-; FALLBACK27-NEXT: popl %ebp
-; FALLBACK27-NEXT: vzeroupper
-; FALLBACK27-NEXT: retl
-;
-; FALLBACK28-LABEL: shl_64bytes:
-; FALLBACK28: # %bb.0:
-; FALLBACK28-NEXT: pushl %ebp
-; FALLBACK28-NEXT: pushl %ebx
-; FALLBACK28-NEXT: pushl %edi
-; FALLBACK28-NEXT: pushl %esi
-; FALLBACK28-NEXT: subl $204, %esp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: vmovups (%ecx), %zmm0
-; FALLBACK28-NEXT: movl (%eax), %eax
-; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %eax, %edx
-; FALLBACK28-NEXT: andl $60, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: subl %edx, %ecx
-; FALLBACK28-NEXT: movl (%ecx), %edi
-; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 4(%ecx), %edx
-; FALLBACK28-NEXT: movl %ecx, %ebp
-; FALLBACK28-NEXT: shll $3, %eax
-; FALLBACK28-NEXT: andl $24, %eax
-; FALLBACK28-NEXT: movl %edx, %esi
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: shrl %edi
-; FALLBACK28-NEXT: movb %al, %ch
-; FALLBACK28-NEXT: notb %ch
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: orl %esi, %edi
-; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 12(%ebp), %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: movl 8(%ebp), %esi
-; FALLBACK28-NEXT: movl %ebp, %edi
-; FALLBACK28-NEXT: movl %esi, %ebp
-; FALLBACK28-NEXT: shrl %ebp
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: orl %ebx, %ebp
-; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: shrl %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %edx
-; FALLBACK28-NEXT: orl %esi, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl %edi, %ebp
-; FALLBACK28-NEXT: movl 20(%edi), %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: movl 16(%edi), %esi
-; FALLBACK28-NEXT: movl %esi, %edx
-; FALLBACK28-NEXT: shrl %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %edx
-; FALLBACK28-NEXT: orl %ebx, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK28-NEXT: shrl %edi
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: orl %esi, %edi
-; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl %ebp, %edx
-; FALLBACK28-NEXT: movl 28(%ebp), %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: movl 24(%ebp), %esi
-; FALLBACK28-NEXT: movl %esi, %edi
-; FALLBACK28-NEXT: shrl %edi
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: orl %ebx, %edi
-; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK28-NEXT: shrl %ebp
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: orl %esi, %ebp
-; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 36(%edx), %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: movl 32(%edx), %esi
-; FALLBACK28-NEXT: movl %edx, %ebp
-; FALLBACK28-NEXT: movl %esi, %edi
-; FALLBACK28-NEXT: shrl %edi
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: orl %ebx, %edi
-; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: shrl %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %edx
-; FALLBACK28-NEXT: orl %esi, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 44(%ebp), %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: movl 40(%ebp), %esi
-; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl %esi, %edx
-; FALLBACK28-NEXT: shrl %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %edx
-; FALLBACK28-NEXT: orl %ebx, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: shrl %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %edx
-; FALLBACK28-NEXT: orl %esi, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 52(%ebp), %esi
-; FALLBACK28-NEXT: movl %esi, %edi
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: negl %edx
-; FALLBACK28-NEXT: movl 176(%esp,%edx), %ebx
-; FALLBACK28-NEXT: movl %ebx, %ebp
-; FALLBACK28-NEXT: shrl %ebp
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: orl %edi, %ebp
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: shrl %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %edx
-; FALLBACK28-NEXT: orl %ebx, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK28-NEXT: movl 60(%edi), %edx
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: movl 56(%edi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %edi
-; FALLBACK28-NEXT: shrl %edi
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: orl %edx, %edi
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: shrl %esi
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shrl %cl, %esi
-; FALLBACK28-NEXT: orl %ebx, %esi
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl %edx, (%eax)
-; FALLBACK28-NEXT: movl %esi, 56(%eax)
-; FALLBACK28-NEXT: movl %edi, 60(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 48(%eax)
-; FALLBACK28-NEXT: movl %ebp, 52(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 40(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 44(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 32(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 36(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 24(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 28(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 16(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 20(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 8(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 12(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 4(%eax)
-; FALLBACK28-NEXT: addl $204, %esp
-; FALLBACK28-NEXT: popl %esi
-; FALLBACK28-NEXT: popl %edi
-; FALLBACK28-NEXT: popl %ebx
-; FALLBACK28-NEXT: popl %ebp
-; FALLBACK28-NEXT: vzeroupper
-; FALLBACK28-NEXT: retl
-;
-; FALLBACK29-LABEL: shl_64bytes:
-; FALLBACK29: # %bb.0:
-; FALLBACK29-NEXT: pushl %ebp
-; FALLBACK29-NEXT: pushl %ebx
-; FALLBACK29-NEXT: pushl %edi
-; FALLBACK29-NEXT: pushl %esi
-; FALLBACK29-NEXT: subl $188, %esp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK29-NEXT: vmovups (%ecx), %zmm0
-; FALLBACK29-NEXT: movl (%eax), %ecx
-; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %ecx, %ebp
-; FALLBACK29-NEXT: andl $60, %ebp
-; FALLBACK29-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: subl %ebp, %eax
-; FALLBACK29-NEXT: movl 8(%eax), %esi
-; FALLBACK29-NEXT: movl 12(%eax), %edx
-; FALLBACK29-NEXT: shll $3, %ecx
-; FALLBACK29-NEXT: andl $24, %ecx
-; FALLBACK29-NEXT: movl %edx, %edi
-; FALLBACK29-NEXT: shldl %cl, %esi, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 4(%eax), %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shldl %cl, %edi, %esi
-; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 16(%eax), %edi
-; FALLBACK29-NEXT: movl 20(%eax), %esi
-; FALLBACK29-NEXT: movl %esi, %ebx
-; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shldl %cl, %edx, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 24(%eax), %edi
-; FALLBACK29-NEXT: movl 28(%eax), %edx
-; FALLBACK29-NEXT: movl %edx, %ebx
-; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shldl %cl, %esi, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 32(%eax), %edi
-; FALLBACK29-NEXT: movl 36(%eax), %esi
-; FALLBACK29-NEXT: movl %esi, %ebx
-; FALLBACK29-NEXT: shldl %cl, %edi, %ebx
-; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shldl %cl, %edx, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 40(%eax), %edx
-; FALLBACK29-NEXT: movl 44(%eax), %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shldl %cl, %edx, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shldl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK29-NEXT: movl 56(%eax), %edx
-; FALLBACK29-NEXT: movl 60(%eax), %edi
-; FALLBACK29-NEXT: shldl %cl, %edx, %edi
-; FALLBACK29-NEXT: movl (%eax), %ebx
-; FALLBACK29-NEXT: movl 52(%eax), %esi
-; FALLBACK29-NEXT: shldl %cl, %esi, %edx
-; FALLBACK29-NEXT: negl %ebp
-; FALLBACK29-NEXT: movl 160(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK29-NEXT: movl %edx, 56(%ebp)
-; FALLBACK29-NEXT: movl %edi, 60(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK29-NEXT: shldl %cl, %ebx, %edx
-; FALLBACK29-NEXT: shll %cl, %ebx
-; FALLBACK29-NEXT: shldl %cl, %eax, %esi
-; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK29-NEXT: shldl %cl, %edi, %eax
-; FALLBACK29-NEXT: movl %eax, 48(%ebp)
-; FALLBACK29-NEXT: movl %esi, 52(%ebp)
-; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 40(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 44(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 32(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 36(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 24(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 28(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 16(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 20(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 8(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 12(%ebp)
-; FALLBACK29-NEXT: movl %ebx, (%ebp)
-; FALLBACK29-NEXT: movl %edx, 4(%ebp)
-; FALLBACK29-NEXT: addl $188, %esp
-; FALLBACK29-NEXT: popl %esi
-; FALLBACK29-NEXT: popl %edi
-; FALLBACK29-NEXT: popl %ebx
-; FALLBACK29-NEXT: popl %ebp
-; FALLBACK29-NEXT: vzeroupper
-; FALLBACK29-NEXT: retl
-;
-; FALLBACK30-LABEL: shl_64bytes:
-; FALLBACK30: # %bb.0:
-; FALLBACK30-NEXT: pushl %ebp
-; FALLBACK30-NEXT: pushl %ebx
-; FALLBACK30-NEXT: pushl %edi
-; FALLBACK30-NEXT: pushl %esi
-; FALLBACK30-NEXT: subl $204, %esp
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: vmovups (%ecx), %zmm0
-; FALLBACK30-NEXT: movl (%eax), %eax
-; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: leal (,%eax,8), %edx
-; FALLBACK30-NEXT: andl $24, %edx
-; FALLBACK30-NEXT: andl $60, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi
-; FALLBACK30-NEXT: subl %eax, %edi
-; FALLBACK30-NEXT: movl (%edi), %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 4(%edi), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl %edx, %ebx
-; FALLBACK30-NEXT: notb %bl
-; FALLBACK30-NEXT: shrl %ecx
-; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi
-; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx
-; FALLBACK30-NEXT: orl %ecx, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 8(%edi), %esi
-; FALLBACK30-NEXT: movl %esi, %ecx
-; FALLBACK30-NEXT: shrl %ecx
-; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK30-NEXT: movl 12(%edi), %ecx
-; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shlxl %edx, %esi, %esi
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: shrl %eax
-; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT: orl %esi, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 16(%edi), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrl %eax
-; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT: movl 20(%edi), %esi
-; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: shrl %ecx
-; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT: orl %eax, %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 24(%edi), %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrl %ecx
-; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK30-NEXT: movl 28(%edi), %ecx
-; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: shrl %esi
-; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT: orl %eax, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 32(%edi), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrl %eax
-; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT: movl 36(%edi), %esi
-; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: shrl %ecx
-; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT: orl %eax, %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 40(%edi), %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrl %ecx
-; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax
-; FALLBACK30-NEXT: movl 44(%edi), %ecx
-; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: shrl %esi
-; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT: orl %eax, %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 48(%edi), %esi
-; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrl %esi
-; FALLBACK30-NEXT: shrxl %ebx, %esi, %eax
-; FALLBACK30-NEXT: movl 52(%edi), %esi
-; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: shrl %ecx
-; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebp
-; FALLBACK30-NEXT: orl %eax, %ebp
-; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK30-NEXT: negl %eax
-; FALLBACK30-NEXT: shlxl %edx, 188(%esp,%eax), %ecx
-; FALLBACK30-NEXT: movl 56(%edi), %eax
-; FALLBACK30-NEXT: shlxl %edx, %eax, %edx
-; FALLBACK30-NEXT: shrl %esi
-; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi
-; FALLBACK30-NEXT: orl %edx, %esi
-; FALLBACK30-NEXT: shrl %eax
-; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax
-; FALLBACK30-NEXT: orl %eax, %ecx
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK30-NEXT: movl %edx, (%eax)
-; FALLBACK30-NEXT: movl %esi, 56(%eax)
-; FALLBACK30-NEXT: movl %ecx, 60(%eax)
-; FALLBACK30-NEXT: movl %ebp, 48(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 52(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 40(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 44(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 32(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 36(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 24(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 28(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 16(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 20(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 8(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 12(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 4(%eax)
-; FALLBACK30-NEXT: addl $204, %esp
-; FALLBACK30-NEXT: popl %esi
-; FALLBACK30-NEXT: popl %edi
-; FALLBACK30-NEXT: popl %ebx
-; FALLBACK30-NEXT: popl %ebp
-; FALLBACK30-NEXT: vzeroupper
-; FALLBACK30-NEXT: retl
-;
-; FALLBACK31-LABEL: shl_64bytes:
-; FALLBACK31: # %bb.0:
-; FALLBACK31-NEXT: pushl %ebp
-; FALLBACK31-NEXT: pushl %ebx
-; FALLBACK31-NEXT: pushl %edi
-; FALLBACK31-NEXT: pushl %esi
-; FALLBACK31-NEXT: subl $204, %esp
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK31-NEXT: vmovups (%ecx), %zmm0
-; FALLBACK31-NEXT: movl (%eax), %ebx
-; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: leal (,%ebx,8), %ecx
-; FALLBACK31-NEXT: andl $24, %ecx
-; FALLBACK31-NEXT: andl $60, %ebx
-; FALLBACK31-NEXT: leal {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: subl %ebx, %eax
-; FALLBACK31-NEXT: movl 4(%eax), %esi
-; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 8(%eax), %edi
-; FALLBACK31-NEXT: movl 12(%eax), %edx
-; FALLBACK31-NEXT: movl %edx, %ebp
-; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shldl %cl, %esi, %edi
-; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 16(%eax), %edi
-; FALLBACK31-NEXT: movl 20(%eax), %esi
-; FALLBACK31-NEXT: movl %esi, %ebp
-; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shldl %cl, %edx, %edi
-; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 24(%eax), %edi
-; FALLBACK31-NEXT: movl 28(%eax), %edx
-; FALLBACK31-NEXT: movl %edx, %ebp
-; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shldl %cl, %esi, %edi
-; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 32(%eax), %edi
-; FALLBACK31-NEXT: movl 36(%eax), %esi
-; FALLBACK31-NEXT: movl %esi, %ebp
-; FALLBACK31-NEXT: shldl %cl, %edi, %ebp
-; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shldl %cl, %edx, %edi
-; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 40(%eax), %ebp
-; FALLBACK31-NEXT: movl 44(%eax), %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shldl %cl, %ebp, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shldl %cl, %esi, %ebp
-; FALLBACK31-NEXT: movl 56(%eax), %edx
-; FALLBACK31-NEXT: movl 60(%eax), %edi
-; FALLBACK31-NEXT: shldl %cl, %edx, %edi
-; FALLBACK31-NEXT: movl (%eax), %esi
-; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 52(%eax), %esi
-; FALLBACK31-NEXT: shldl %cl, %esi, %edx
-; FALLBACK31-NEXT: negl %ebx
-; FALLBACK31-NEXT: movl 176(%esp,%ebx), %ebx
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: movl %edx, 56(%eax)
-; FALLBACK31-NEXT: movl %edi, 60(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK31-NEXT: shlxl %ecx, %edx, %edi
-; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK31-NEXT: shldl %cl, %edx, %edi
-; FALLBACK31-NEXT: shldl %cl, %ebx, %esi
-; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK31-NEXT: shldl %cl, %edx, %ebx
-; FALLBACK31-NEXT: movl %ebx, 48(%eax)
-; FALLBACK31-NEXT: movl %esi, 52(%eax)
-; FALLBACK31-NEXT: movl %ebp, 40(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 44(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 32(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 36(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 24(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 28(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 16(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 20(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 8(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 12(%eax)
-; FALLBACK31-NEXT: movl %edi, 4(%eax)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, (%eax)
-; FALLBACK31-NEXT: addl $204, %esp
-; FALLBACK31-NEXT: popl %esi
-; FALLBACK31-NEXT: popl %edi
-; FALLBACK31-NEXT: popl %ebx
-; FALLBACK31-NEXT: popl %ebp
-; FALLBACK31-NEXT: vzeroupper
-; FALLBACK31-NEXT: retl
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: negl %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movslq %esi, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%rbx), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%rbx), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%rbx), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%rbx), %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r14, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r15, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%rbx), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%rbx), %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r13, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%rbx), %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%rbx), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r13, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negl %esi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movslq %esi, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -48(%rsp,%r9), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -40(%rsp,%r9), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -64(%rsp,%r9), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -56(%rsp,%r9), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -32(%rsp,%r9), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -24(%rsp,%r9), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r11, %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r10, %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -16(%rsp,%r9), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -8(%rsp,%r9), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r10, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldq %cl, %r8, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negl %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movslq %esi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r8, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r14, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rbx, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r15, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r12, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%rsi), %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rsi, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rsi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r14, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rax, %rbx, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rcx, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rsi,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negl %esi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movslq %esi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -48(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -40(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -64(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -56(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -32(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -24(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r11, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r9, %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -16(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %rbx, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r10, %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: negl %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movslq %ecx, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r11, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negl %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negl %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movslq %esi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r9, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r10, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r11, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rbx, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r12, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rax, %r15, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rcx, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%rdi), %xmm3
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negl %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: negl %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movslq %ecx, %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r11, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r12, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r13, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r9, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: negl %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: negl %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movslq %esi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r8, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r9, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r9, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r10, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r10, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r11, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r11, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rbx, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r15, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r12, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rax, %r15, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rcx, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %ymm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: negl %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rcx,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: negl %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movslq %ecx, %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r9), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r9), %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r9), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r11, %r10
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r9), %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r9), %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r9), %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r9), %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r9, %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: negl %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: negl %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %esi, %rsi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%rsi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rdi, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%rsi), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r8, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r9, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%rsi), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r9, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%rsi), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r10, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r11, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%rsi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%rsi), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rbx, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%rsi), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r15, %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r12, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r15, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rcx, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: negl %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %eax, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%eax), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%edx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: negl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 176(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%edi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ecx), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ecx), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%ecx), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ecx), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%ecx), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%ecx), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl %ebp, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: negl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 160(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shldl %cl, %edi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ebp), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%ebp,8), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%edx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %esi, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: negl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 56(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 60(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 48(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 52(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 40(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 44(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 32(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 36(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%ebp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: xorps %xmm0, %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%ebp,8), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: negl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 176(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 60(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %ecx, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %ebp, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shldl %cl, %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, 48(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 40(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 4(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 12(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 8(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%edi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%edx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: negl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 176(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%edi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl %ebp, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 8(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 4(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 16(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 20(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 24(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 28(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 32(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 36(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 40(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 44(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: negl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 160(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shldl %cl, %edi, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%eax,8), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 4(%edx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %bl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 8(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 12(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%edx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %esi, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %esi, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebp, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: negl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 56(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 60(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 48(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 52(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 40(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 44(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 32(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 36(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 48(%ecx), %xmm3
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: xorps %xmm4, %xmm4
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm4, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm3, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%ebp,8), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 4(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 8(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 16(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 20(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 24(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 28(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 32(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 36(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 40(%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 44(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: negl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 176(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 60(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %ecx, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %ebp, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shldl %cl, %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, 48(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 40(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: subl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 4(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 12(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 8(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 20(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 16(%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 28(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 24(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 36(%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 32(%edx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 44(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 40(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: negl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 176(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%edi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX1-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subl %ebp, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 8(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 4(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 16(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 20(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 24(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 28(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 32(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 36(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 40(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 44(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: negl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 160(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shldl %cl, %edi, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%eax,8), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 4(%edx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %bl
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 8(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 12(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 16(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 20(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 24(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 28(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 32(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 36(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 40(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 44(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %eax, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%edx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %esi, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %esi, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ebp, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: negl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 56(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, 60(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 48(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 52(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 40(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 44(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 32(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 36(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %ymm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%ebx,8), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subl %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 4(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 8(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 16(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 20(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 24(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 28(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 32(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 36(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 40(%eax), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 44(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %ebp, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: negl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 176(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, 60(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %ecx, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, 48(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, 40(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl %edx, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 4(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 12(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 8(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 20(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 16(%edi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 28(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 24(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 36(%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 32(%edx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 44(%ebp), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 40(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%ebp), %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: negl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 176(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%edi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl %ebp, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 8(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 4(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 16(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 20(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 24(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 28(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 32(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 36(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 40(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 44(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: negl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 160(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %ebx, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%eax,8), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl %eax, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 4(%edx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %bl
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 8(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 12(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 16(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 20(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 24(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 28(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 32(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 36(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 40(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 44(%edx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 48(%edx), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%edx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %ecx, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %ebp, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%edx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: negl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ecx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, (%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 56(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 60(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 48(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 52(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 40(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 4(%edx)
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%ebx,8), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl %ebx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 4(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 8(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 12(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 16(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 20(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 24(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 28(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 32(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 36(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 40(%eax), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 44(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %ebp, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 60(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: negl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 176(%esp,%ebx), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 56(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 60(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %ebx, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 48(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 52(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, 40(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 44(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 32(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 36(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 24(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 28(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 16(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 20(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 8(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 12(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 4(%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, (%eax)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
@@ -20159,4099 +17857,3115 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
}
define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
-; FALLBACK0-LABEL: ashr_64bytes:
-; FALLBACK0: # %bb.0:
-; FALLBACK0-NEXT: pushq %r15
-; FALLBACK0-NEXT: pushq %r14
-; FALLBACK0-NEXT: pushq %r13
-; FALLBACK0-NEXT: pushq %r12
-; FALLBACK0-NEXT: pushq %rbx
-; FALLBACK0-NEXT: movq (%rdi), %rax
-; FALLBACK0-NEXT: movq 8(%rdi), %rcx
-; FALLBACK0-NEXT: movq 16(%rdi), %r8
-; FALLBACK0-NEXT: movq 24(%rdi), %r9
-; FALLBACK0-NEXT: movq 32(%rdi), %r10
-; FALLBACK0-NEXT: movq 40(%rdi), %r11
-; FALLBACK0-NEXT: movq 48(%rdi), %rbx
-; FALLBACK0-NEXT: movq 56(%rdi), %r14
-; FALLBACK0-NEXT: movl (%rsi), %edi
-; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: sarq $63, %r14
-; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK0-NEXT: leal (,%rdi,8), %eax
-; FALLBACK0-NEXT: andl $56, %eax
-; FALLBACK0-NEXT: andl $56, %edi
-; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10
-; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8
-; FALLBACK0-NEXT: movq %r8, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r11
-; FALLBACK0-NEXT: movl %eax, %esi
-; FALLBACK0-NEXT: notb %sil
-; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx
-; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r9
-; FALLBACK0-NEXT: orq %r11, %r9
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r10
-; FALLBACK0-NEXT: addq %r8, %r8
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r8
-; FALLBACK0-NEXT: orq %r10, %r8
-; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10
-; FALLBACK0-NEXT: movq %r10, %r15
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r15
-; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14
-; FALLBACK0-NEXT: leaq (%r14,%r14), %r11
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r11
-; FALLBACK0-NEXT: orq %r15, %r11
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %rbx
-; FALLBACK0-NEXT: addq %r10, %r10
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r10
-; FALLBACK0-NEXT: orq %rbx, %r10
-; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx
-; FALLBACK0-NEXT: movq %rbx, %r12
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r12
-; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13
-; FALLBACK0-NEXT: leaq (%r13,%r13), %r15
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r15
-; FALLBACK0-NEXT: orq %r12, %r15
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r14
-; FALLBACK0-NEXT: addq %rbx, %rbx
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %rbx
-; FALLBACK0-NEXT: orq %r14, %rbx
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: shrq %cl, %r13
-; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi
-; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14
-; FALLBACK0-NEXT: movl %esi, %ecx
-; FALLBACK0-NEXT: shlq %cl, %r14
-; FALLBACK0-NEXT: orq %r13, %r14
-; FALLBACK0-NEXT: movl %eax, %ecx
-; FALLBACK0-NEXT: sarq %cl, %rdi
-; FALLBACK0-NEXT: movq %rdi, 56(%rdx)
-; FALLBACK0-NEXT: movq %r14, 48(%rdx)
-; FALLBACK0-NEXT: movq %rbx, 32(%rdx)
-; FALLBACK0-NEXT: movq %r15, 40(%rdx)
-; FALLBACK0-NEXT: movq %r10, 16(%rdx)
-; FALLBACK0-NEXT: movq %r11, 24(%rdx)
-; FALLBACK0-NEXT: movq %r8, (%rdx)
-; FALLBACK0-NEXT: movq %r9, 8(%rdx)
-; FALLBACK0-NEXT: popq %rbx
-; FALLBACK0-NEXT: popq %r12
-; FALLBACK0-NEXT: popq %r13
-; FALLBACK0-NEXT: popq %r14
-; FALLBACK0-NEXT: popq %r15
-; FALLBACK0-NEXT: retq
-;
-; FALLBACK1-LABEL: ashr_64bytes:
-; FALLBACK1: # %bb.0:
-; FALLBACK1-NEXT: pushq %r15
-; FALLBACK1-NEXT: pushq %r14
-; FALLBACK1-NEXT: pushq %rbx
-; FALLBACK1-NEXT: movq (%rdi), %rcx
-; FALLBACK1-NEXT: movq 8(%rdi), %r8
-; FALLBACK1-NEXT: movq 16(%rdi), %r9
-; FALLBACK1-NEXT: movq 24(%rdi), %r10
-; FALLBACK1-NEXT: movq 32(%rdi), %r11
-; FALLBACK1-NEXT: movq 40(%rdi), %rbx
-; FALLBACK1-NEXT: movq 48(%rdi), %r14
-; FALLBACK1-NEXT: movq 56(%rdi), %rdi
-; FALLBACK1-NEXT: movl (%rsi), %eax
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: sarq $63, %rdi
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK1-NEXT: leal (,%rax,8), %ecx
-; FALLBACK1-NEXT: andl $56, %ecx
-; FALLBACK1-NEXT: andl $56, %eax
-; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi
-; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi
-; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9
-; FALLBACK1-NEXT: movq %r9, %r8
-; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8
-; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10
-; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11
-; FALLBACK1-NEXT: movq %r11, %rbx
-; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx
-; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11
-; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14
-; FALLBACK1-NEXT: movq %r14, %r15
-; FALLBACK1-NEXT: shrdq %cl, %r11, %r15
-; FALLBACK1-NEXT: shrdq %cl, %r14, %r10
-; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK1-NEXT: shrdq %cl, %rax, %r11
-; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi
-; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK1-NEXT: sarq %cl, %rax
-; FALLBACK1-NEXT: movq %r11, 48(%rdx)
-; FALLBACK1-NEXT: movq %rax, 56(%rdx)
-; FALLBACK1-NEXT: movq %r10, 32(%rdx)
-; FALLBACK1-NEXT: movq %r15, 40(%rdx)
-; FALLBACK1-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK1-NEXT: movq %rbx, 24(%rdx)
-; FALLBACK1-NEXT: movq %rsi, (%rdx)
-; FALLBACK1-NEXT: movq %r8, 8(%rdx)
-; FALLBACK1-NEXT: popq %rbx
-; FALLBACK1-NEXT: popq %r14
-; FALLBACK1-NEXT: popq %r15
-; FALLBACK1-NEXT: retq
-;
-; FALLBACK2-LABEL: ashr_64bytes:
-; FALLBACK2: # %bb.0:
-; FALLBACK2-NEXT: pushq %rbp
-; FALLBACK2-NEXT: pushq %r15
-; FALLBACK2-NEXT: pushq %r14
-; FALLBACK2-NEXT: pushq %r13
-; FALLBACK2-NEXT: pushq %r12
-; FALLBACK2-NEXT: pushq %rbx
-; FALLBACK2-NEXT: pushq %rax
-; FALLBACK2-NEXT: movq (%rdi), %rcx
-; FALLBACK2-NEXT: movq 8(%rdi), %r8
-; FALLBACK2-NEXT: movq 16(%rdi), %r9
-; FALLBACK2-NEXT: movq 24(%rdi), %r10
-; FALLBACK2-NEXT: movq 32(%rdi), %r11
-; FALLBACK2-NEXT: movq 40(%rdi), %rbx
-; FALLBACK2-NEXT: movq 48(%rdi), %r14
-; FALLBACK2-NEXT: movq 56(%rdi), %rdi
-; FALLBACK2-NEXT: movl (%rsi), %eax
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: sarq $63, %rdi
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK2-NEXT: leal (,%rax,8), %ecx
-; FALLBACK2-NEXT: andl $56, %ecx
-; FALLBACK2-NEXT: andl $56, %eax
-; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi
-; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9
-; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx
-; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13
-; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi
-; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8
-; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10
-; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11
-; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14
-; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15
-; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp
-; FALLBACK2-NEXT: movl %ecx, %r12d
-; FALLBACK2-NEXT: notb %r12b
-; FALLBACK2-NEXT: addq %r9, %r9
-; FALLBACK2-NEXT: shlxq %r12, %r9, %r9
-; FALLBACK2-NEXT: orq %rbx, %r9
-; FALLBACK2-NEXT: addq %rdi, %rdi
-; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi
-; FALLBACK2-NEXT: orq %r13, %rdi
-; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx
-; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13
-; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx
-; FALLBACK2-NEXT: addq %r10, %r10
-; FALLBACK2-NEXT: shlxq %r12, %r10, %r10
-; FALLBACK2-NEXT: orq %r8, %r10
-; FALLBACK2-NEXT: addq %rsi, %rsi
-; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi
-; FALLBACK2-NEXT: orq %r11, %rsi
-; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8
-; FALLBACK2-NEXT: shlxq %r12, %r8, %r8
-; FALLBACK2-NEXT: orq %r15, %r8
-; FALLBACK2-NEXT: addq %r14, %r14
-; FALLBACK2-NEXT: shlxq %r12, %r14, %r11
-; FALLBACK2-NEXT: orq %rbp, %r11
-; FALLBACK2-NEXT: addq %rax, %rax
-; FALLBACK2-NEXT: shlxq %r12, %rax, %rax
-; FALLBACK2-NEXT: orq %r13, %rax
-; FALLBACK2-NEXT: movq %rcx, 56(%rdx)
-; FALLBACK2-NEXT: movq %rax, 48(%rdx)
-; FALLBACK2-NEXT: movq %r11, 32(%rdx)
-; FALLBACK2-NEXT: movq %r8, 40(%rdx)
-; FALLBACK2-NEXT: movq %rsi, 16(%rdx)
-; FALLBACK2-NEXT: movq %r10, 24(%rdx)
-; FALLBACK2-NEXT: movq %rdi, (%rdx)
-; FALLBACK2-NEXT: movq %r9, 8(%rdx)
-; FALLBACK2-NEXT: addq $8, %rsp
-; FALLBACK2-NEXT: popq %rbx
-; FALLBACK2-NEXT: popq %r12
-; FALLBACK2-NEXT: popq %r13
-; FALLBACK2-NEXT: popq %r14
-; FALLBACK2-NEXT: popq %r15
-; FALLBACK2-NEXT: popq %rbp
-; FALLBACK2-NEXT: retq
-;
-; FALLBACK3-LABEL: ashr_64bytes:
-; FALLBACK3: # %bb.0:
-; FALLBACK3-NEXT: pushq %r15
-; FALLBACK3-NEXT: pushq %r14
-; FALLBACK3-NEXT: pushq %rbx
-; FALLBACK3-NEXT: movq (%rdi), %rcx
-; FALLBACK3-NEXT: movq 8(%rdi), %r8
-; FALLBACK3-NEXT: movq 16(%rdi), %r9
-; FALLBACK3-NEXT: movq 24(%rdi), %r10
-; FALLBACK3-NEXT: movq 32(%rdi), %r11
-; FALLBACK3-NEXT: movq 40(%rdi), %rbx
-; FALLBACK3-NEXT: movq 48(%rdi), %r14
-; FALLBACK3-NEXT: movq 56(%rdi), %rdi
-; FALLBACK3-NEXT: movl (%rsi), %eax
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: sarq $63, %rdi
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK3-NEXT: leal (,%rax,8), %ecx
-; FALLBACK3-NEXT: andl $56, %ecx
-; FALLBACK3-NEXT: andl $56, %eax
-; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi
-; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi
-; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9
-; FALLBACK3-NEXT: movq %r9, %r8
-; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8
-; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10
-; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11
-; FALLBACK3-NEXT: movq %r11, %rbx
-; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx
-; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11
-; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14
-; FALLBACK3-NEXT: movq %r14, %r15
-; FALLBACK3-NEXT: shrdq %cl, %r11, %r15
-; FALLBACK3-NEXT: shrdq %cl, %r14, %r10
-; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK3-NEXT: shrdq %cl, %rax, %r11
-; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax
-; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi
-; FALLBACK3-NEXT: movq %r11, 48(%rdx)
-; FALLBACK3-NEXT: movq %r10, 32(%rdx)
-; FALLBACK3-NEXT: movq %r15, 40(%rdx)
-; FALLBACK3-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK3-NEXT: movq %rbx, 24(%rdx)
-; FALLBACK3-NEXT: movq %rsi, (%rdx)
-; FALLBACK3-NEXT: movq %r8, 8(%rdx)
-; FALLBACK3-NEXT: movq %rax, 56(%rdx)
-; FALLBACK3-NEXT: popq %rbx
-; FALLBACK3-NEXT: popq %r14
-; FALLBACK3-NEXT: popq %r15
-; FALLBACK3-NEXT: retq
-;
-; FALLBACK4-LABEL: ashr_64bytes:
-; FALLBACK4: # %bb.0:
-; FALLBACK4-NEXT: pushq %rbp
-; FALLBACK4-NEXT: pushq %r15
-; FALLBACK4-NEXT: pushq %r14
-; FALLBACK4-NEXT: pushq %r13
-; FALLBACK4-NEXT: pushq %r12
-; FALLBACK4-NEXT: pushq %rbx
-; FALLBACK4-NEXT: pushq %rax
-; FALLBACK4-NEXT: movups (%rdi), %xmm0
-; FALLBACK4-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK4-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK4-NEXT: movq 48(%rdi), %rax
-; FALLBACK4-NEXT: movq 56(%rdi), %rcx
-; FALLBACK4-NEXT: movl (%rsi), %edi
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: sarq $63, %rcx
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK4-NEXT: leal (,%rdi,8), %eax
-; FALLBACK4-NEXT: andl $56, %eax
-; FALLBACK4-NEXT: andl $56, %edi
-; FALLBACK4-NEXT: movq -128(%rsp,%rdi), %r10
-; FALLBACK4-NEXT: movq -120(%rsp,%rdi), %r9
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r10
-; FALLBACK4-NEXT: movl %eax, %esi
-; FALLBACK4-NEXT: notb %sil
-; FALLBACK4-NEXT: leaq (%r9,%r9), %r8
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r8
-; FALLBACK4-NEXT: orq %r10, %r8
-; FALLBACK4-NEXT: movq -104(%rsp,%rdi), %r10
-; FALLBACK4-NEXT: movq %r10, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %rbx
-; FALLBACK4-NEXT: movq -96(%rsp,%rdi), %r12
-; FALLBACK4-NEXT: leaq (%r12,%r12), %r11
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r11
-; FALLBACK4-NEXT: orq %rbx, %r11
-; FALLBACK4-NEXT: movq -112(%rsp,%rdi), %rbx
-; FALLBACK4-NEXT: movq %rbx, %r14
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r14
-; FALLBACK4-NEXT: addq %r10, %r10
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r10
-; FALLBACK4-NEXT: orq %r14, %r10
-; FALLBACK4-NEXT: movq -88(%rsp,%rdi), %r14
-; FALLBACK4-NEXT: movq %r14, %r13
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r13
-; FALLBACK4-NEXT: movq -80(%rsp,%rdi), %rbp
-; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r15
-; FALLBACK4-NEXT: orq %r13, %r15
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r12
-; FALLBACK4-NEXT: addq %r14, %r14
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r14
-; FALLBACK4-NEXT: orq %r12, %r14
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %rbp
-; FALLBACK4-NEXT: movq -72(%rsp,%rdi), %rdi
-; FALLBACK4-NEXT: leaq (%rdi,%rdi), %r12
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %r12
-; FALLBACK4-NEXT: orq %rbp, %r12
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: shrq %cl, %r9
-; FALLBACK4-NEXT: addq %rbx, %rbx
-; FALLBACK4-NEXT: movl %esi, %ecx
-; FALLBACK4-NEXT: shlq %cl, %rbx
-; FALLBACK4-NEXT: orq %r9, %rbx
-; FALLBACK4-NEXT: movl %eax, %ecx
-; FALLBACK4-NEXT: sarq %cl, %rdi
-; FALLBACK4-NEXT: movq %rdi, 56(%rdx)
-; FALLBACK4-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK4-NEXT: movq %r12, 48(%rdx)
-; FALLBACK4-NEXT: movq %r14, 32(%rdx)
-; FALLBACK4-NEXT: movq %r15, 40(%rdx)
-; FALLBACK4-NEXT: movq %r10, 16(%rdx)
-; FALLBACK4-NEXT: movq %r11, 24(%rdx)
-; FALLBACK4-NEXT: movq %r8, (%rdx)
-; FALLBACK4-NEXT: addq $8, %rsp
-; FALLBACK4-NEXT: popq %rbx
-; FALLBACK4-NEXT: popq %r12
-; FALLBACK4-NEXT: popq %r13
-; FALLBACK4-NEXT: popq %r14
-; FALLBACK4-NEXT: popq %r15
-; FALLBACK4-NEXT: popq %rbp
-; FALLBACK4-NEXT: retq
-;
-; FALLBACK5-LABEL: ashr_64bytes:
-; FALLBACK5: # %bb.0:
-; FALLBACK5-NEXT: pushq %r15
-; FALLBACK5-NEXT: pushq %r14
-; FALLBACK5-NEXT: pushq %rbx
-; FALLBACK5-NEXT: movups (%rdi), %xmm0
-; FALLBACK5-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK5-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK5-NEXT: movq 48(%rdi), %rcx
-; FALLBACK5-NEXT: movq 56(%rdi), %rdi
-; FALLBACK5-NEXT: movl (%rsi), %eax
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: sarq $63, %rdi
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK5-NEXT: leal (,%rax,8), %ecx
-; FALLBACK5-NEXT: andl $56, %ecx
-; FALLBACK5-NEXT: andl $56, %eax
-; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK5-NEXT: movq %r9, %rsi
-; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK5-NEXT: movq %r10, %r8
-; FALLBACK5-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK5-NEXT: movq %r11, %rbx
-; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK5-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK5-NEXT: movq %rax, %r15
-; FALLBACK5-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK5-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK5-NEXT: sarq %cl, %r11
-; FALLBACK5-NEXT: movq %r15, 8(%rdx)
-; FALLBACK5-NEXT: movq %r9, 48(%rdx)
-; FALLBACK5-NEXT: movq %r11, 56(%rdx)
-; FALLBACK5-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK5-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK5-NEXT: movq %r8, 16(%rdx)
-; FALLBACK5-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK5-NEXT: movq %r14, (%rdx)
-; FALLBACK5-NEXT: popq %rbx
-; FALLBACK5-NEXT: popq %r14
-; FALLBACK5-NEXT: popq %r15
-; FALLBACK5-NEXT: retq
-;
-; FALLBACK6-LABEL: ashr_64bytes:
-; FALLBACK6: # %bb.0:
-; FALLBACK6-NEXT: pushq %rbp
-; FALLBACK6-NEXT: pushq %r15
-; FALLBACK6-NEXT: pushq %r14
-; FALLBACK6-NEXT: pushq %r13
-; FALLBACK6-NEXT: pushq %r12
-; FALLBACK6-NEXT: pushq %rbx
-; FALLBACK6-NEXT: pushq %rax
-; FALLBACK6-NEXT: movups (%rdi), %xmm0
-; FALLBACK6-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK6-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK6-NEXT: movq 48(%rdi), %rcx
-; FALLBACK6-NEXT: movq 56(%rdi), %rdi
-; FALLBACK6-NEXT: movl (%rsi), %eax
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: sarq $63, %rdi
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK6-NEXT: leal (,%rax,8), %esi
-; FALLBACK6-NEXT: andl $56, %esi
-; FALLBACK6-NEXT: andl $56, %eax
-; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx
-; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi
-; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12
-; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13
-; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9
-; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10
-; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14
-; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15
-; FALLBACK6-NEXT: movl %esi, %ebx
-; FALLBACK6-NEXT: notb %bl
-; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp
-; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8
-; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8
-; FALLBACK6-NEXT: orq %r11, %r8
-; FALLBACK6-NEXT: leaq (%r13,%r13), %r11
-; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11
-; FALLBACK6-NEXT: orq %r12, %r11
-; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12
-; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13
-; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp
-; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK6-NEXT: sarxq %rsi, %rax, %rsi
-; FALLBACK6-NEXT: addq %rdi, %rdi
-; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi
-; FALLBACK6-NEXT: orq %r9, %rdi
-; FALLBACK6-NEXT: leaq (%r12,%r12), %r9
-; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9
-; FALLBACK6-NEXT: orq %r14, %r9
-; FALLBACK6-NEXT: addq %r10, %r10
-; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10
-; FALLBACK6-NEXT: orq %r15, %r10
-; FALLBACK6-NEXT: addq %rax, %rax
-; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax
-; FALLBACK6-NEXT: orq %r13, %rax
-; FALLBACK6-NEXT: addq %rcx, %rcx
-; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx
-; FALLBACK6-NEXT: orq %rbp, %rcx
-; FALLBACK6-NEXT: movq %rsi, 56(%rdx)
-; FALLBACK6-NEXT: movq %rcx, 8(%rdx)
-; FALLBACK6-NEXT: movq %rax, 48(%rdx)
-; FALLBACK6-NEXT: movq %r10, 32(%rdx)
-; FALLBACK6-NEXT: movq %r9, 40(%rdx)
-; FALLBACK6-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK6-NEXT: movq %r11, 24(%rdx)
-; FALLBACK6-NEXT: movq %r8, (%rdx)
-; FALLBACK6-NEXT: addq $8, %rsp
-; FALLBACK6-NEXT: popq %rbx
-; FALLBACK6-NEXT: popq %r12
-; FALLBACK6-NEXT: popq %r13
-; FALLBACK6-NEXT: popq %r14
-; FALLBACK6-NEXT: popq %r15
-; FALLBACK6-NEXT: popq %rbp
-; FALLBACK6-NEXT: retq
-;
-; FALLBACK7-LABEL: ashr_64bytes:
-; FALLBACK7: # %bb.0:
-; FALLBACK7-NEXT: pushq %r15
-; FALLBACK7-NEXT: pushq %r14
-; FALLBACK7-NEXT: pushq %rbx
-; FALLBACK7-NEXT: movups (%rdi), %xmm0
-; FALLBACK7-NEXT: movups 16(%rdi), %xmm1
-; FALLBACK7-NEXT: movups 32(%rdi), %xmm2
-; FALLBACK7-NEXT: movq 48(%rdi), %rcx
-; FALLBACK7-NEXT: movq 56(%rdi), %rdi
-; FALLBACK7-NEXT: movl (%rsi), %eax
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: sarq $63, %rdi
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK7-NEXT: leal (,%rax,8), %ecx
-; FALLBACK7-NEXT: andl $56, %ecx
-; FALLBACK7-NEXT: andl $56, %eax
-; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK7-NEXT: movq %r9, %rsi
-; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK7-NEXT: movq %r10, %r8
-; FALLBACK7-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK7-NEXT: movq %r11, %rbx
-; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK7-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK7-NEXT: movq %rax, %r15
-; FALLBACK7-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK7-NEXT: sarxq %rcx, %r11, %r10
-; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK7-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK7-NEXT: movq %r15, 8(%rdx)
-; FALLBACK7-NEXT: movq %r9, 48(%rdx)
-; FALLBACK7-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK7-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK7-NEXT: movq %r8, 16(%rdx)
-; FALLBACK7-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK7-NEXT: movq %r14, (%rdx)
-; FALLBACK7-NEXT: movq %r10, 56(%rdx)
-; FALLBACK7-NEXT: popq %rbx
-; FALLBACK7-NEXT: popq %r14
-; FALLBACK7-NEXT: popq %r15
-; FALLBACK7-NEXT: retq
-;
-; FALLBACK8-LABEL: ashr_64bytes:
-; FALLBACK8: # %bb.0:
-; FALLBACK8-NEXT: pushq %rbp
-; FALLBACK8-NEXT: pushq %r15
-; FALLBACK8-NEXT: pushq %r14
-; FALLBACK8-NEXT: pushq %r13
-; FALLBACK8-NEXT: pushq %r12
-; FALLBACK8-NEXT: pushq %rbx
-; FALLBACK8-NEXT: pushq %rax
-; FALLBACK8-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK8-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK8-NEXT: movq 48(%rdi), %rax
-; FALLBACK8-NEXT: movq 56(%rdi), %rcx
-; FALLBACK8-NEXT: movl (%rsi), %edi
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: sarq $63, %rcx
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK8-NEXT: leal (,%rdi,8), %eax
-; FALLBACK8-NEXT: andl $56, %eax
-; FALLBACK8-NEXT: andl $56, %edi
-; FALLBACK8-NEXT: movq -128(%rsp,%rdi), %r10
-; FALLBACK8-NEXT: movq -120(%rsp,%rdi), %r9
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r10
-; FALLBACK8-NEXT: movl %eax, %esi
-; FALLBACK8-NEXT: notb %sil
-; FALLBACK8-NEXT: leaq (%r9,%r9), %r8
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r8
-; FALLBACK8-NEXT: orq %r10, %r8
-; FALLBACK8-NEXT: movq -104(%rsp,%rdi), %r10
-; FALLBACK8-NEXT: movq %r10, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %rbx
-; FALLBACK8-NEXT: movq -96(%rsp,%rdi), %r12
-; FALLBACK8-NEXT: leaq (%r12,%r12), %r11
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r11
-; FALLBACK8-NEXT: orq %rbx, %r11
-; FALLBACK8-NEXT: movq -112(%rsp,%rdi), %rbx
-; FALLBACK8-NEXT: movq %rbx, %r14
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r14
-; FALLBACK8-NEXT: addq %r10, %r10
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r10
-; FALLBACK8-NEXT: orq %r14, %r10
-; FALLBACK8-NEXT: movq -88(%rsp,%rdi), %r14
-; FALLBACK8-NEXT: movq %r14, %r13
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r13
-; FALLBACK8-NEXT: movq -80(%rsp,%rdi), %rbp
-; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r15
-; FALLBACK8-NEXT: orq %r13, %r15
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r12
-; FALLBACK8-NEXT: addq %r14, %r14
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r14
-; FALLBACK8-NEXT: orq %r12, %r14
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %rbp
-; FALLBACK8-NEXT: movq -72(%rsp,%rdi), %rdi
-; FALLBACK8-NEXT: leaq (%rdi,%rdi), %r12
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %r12
-; FALLBACK8-NEXT: orq %rbp, %r12
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: shrq %cl, %r9
-; FALLBACK8-NEXT: addq %rbx, %rbx
-; FALLBACK8-NEXT: movl %esi, %ecx
-; FALLBACK8-NEXT: shlq %cl, %rbx
-; FALLBACK8-NEXT: orq %r9, %rbx
-; FALLBACK8-NEXT: movl %eax, %ecx
-; FALLBACK8-NEXT: sarq %cl, %rdi
-; FALLBACK8-NEXT: movq %rdi, 56(%rdx)
-; FALLBACK8-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK8-NEXT: movq %r12, 48(%rdx)
-; FALLBACK8-NEXT: movq %r14, 32(%rdx)
-; FALLBACK8-NEXT: movq %r15, 40(%rdx)
-; FALLBACK8-NEXT: movq %r10, 16(%rdx)
-; FALLBACK8-NEXT: movq %r11, 24(%rdx)
-; FALLBACK8-NEXT: movq %r8, (%rdx)
-; FALLBACK8-NEXT: addq $8, %rsp
-; FALLBACK8-NEXT: popq %rbx
-; FALLBACK8-NEXT: popq %r12
-; FALLBACK8-NEXT: popq %r13
-; FALLBACK8-NEXT: popq %r14
-; FALLBACK8-NEXT: popq %r15
-; FALLBACK8-NEXT: popq %rbp
-; FALLBACK8-NEXT: vzeroupper
-; FALLBACK8-NEXT: retq
-;
-; FALLBACK9-LABEL: ashr_64bytes:
-; FALLBACK9: # %bb.0:
-; FALLBACK9-NEXT: pushq %r15
-; FALLBACK9-NEXT: pushq %r14
-; FALLBACK9-NEXT: pushq %rbx
-; FALLBACK9-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK9-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK9-NEXT: movq 48(%rdi), %rcx
-; FALLBACK9-NEXT: movq 56(%rdi), %rdi
-; FALLBACK9-NEXT: movl (%rsi), %eax
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: sarq $63, %rdi
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT: leal (,%rax,8), %ecx
-; FALLBACK9-NEXT: andl $56, %ecx
-; FALLBACK9-NEXT: andl $56, %eax
-; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK9-NEXT: movq %r9, %rsi
-; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK9-NEXT: movq %r10, %r8
-; FALLBACK9-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK9-NEXT: movq %r11, %rbx
-; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK9-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK9-NEXT: movq %rax, %r15
-; FALLBACK9-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK9-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK9-NEXT: sarq %cl, %r11
-; FALLBACK9-NEXT: movq %r15, 8(%rdx)
-; FALLBACK9-NEXT: movq %r9, 48(%rdx)
-; FALLBACK9-NEXT: movq %r11, 56(%rdx)
-; FALLBACK9-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK9-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK9-NEXT: movq %r8, 16(%rdx)
-; FALLBACK9-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK9-NEXT: movq %r14, (%rdx)
-; FALLBACK9-NEXT: popq %rbx
-; FALLBACK9-NEXT: popq %r14
-; FALLBACK9-NEXT: popq %r15
-; FALLBACK9-NEXT: vzeroupper
-; FALLBACK9-NEXT: retq
-;
-; FALLBACK10-LABEL: ashr_64bytes:
-; FALLBACK10: # %bb.0:
-; FALLBACK10-NEXT: pushq %rbp
-; FALLBACK10-NEXT: pushq %r15
-; FALLBACK10-NEXT: pushq %r14
-; FALLBACK10-NEXT: pushq %r13
-; FALLBACK10-NEXT: pushq %r12
-; FALLBACK10-NEXT: pushq %rbx
-; FALLBACK10-NEXT: pushq %rax
-; FALLBACK10-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK10-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK10-NEXT: movq 48(%rdi), %rcx
-; FALLBACK10-NEXT: movq 56(%rdi), %rdi
-; FALLBACK10-NEXT: movl (%rsi), %eax
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: sarq $63, %rdi
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK10-NEXT: leal (,%rax,8), %esi
-; FALLBACK10-NEXT: andl $56, %esi
-; FALLBACK10-NEXT: andl $56, %eax
-; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx
-; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi
-; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12
-; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13
-; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9
-; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10
-; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14
-; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15
-; FALLBACK10-NEXT: movl %esi, %ebx
-; FALLBACK10-NEXT: notb %bl
-; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp
-; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8
-; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8
-; FALLBACK10-NEXT: orq %r11, %r8
-; FALLBACK10-NEXT: leaq (%r13,%r13), %r11
-; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11
-; FALLBACK10-NEXT: orq %r12, %r11
-; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12
-; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13
-; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp
-; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK10-NEXT: sarxq %rsi, %rax, %rsi
-; FALLBACK10-NEXT: addq %rdi, %rdi
-; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi
-; FALLBACK10-NEXT: orq %r9, %rdi
-; FALLBACK10-NEXT: leaq (%r12,%r12), %r9
-; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9
-; FALLBACK10-NEXT: orq %r14, %r9
-; FALLBACK10-NEXT: addq %r10, %r10
-; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10
-; FALLBACK10-NEXT: orq %r15, %r10
-; FALLBACK10-NEXT: addq %rax, %rax
-; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax
-; FALLBACK10-NEXT: orq %r13, %rax
-; FALLBACK10-NEXT: addq %rcx, %rcx
-; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx
-; FALLBACK10-NEXT: orq %rbp, %rcx
-; FALLBACK10-NEXT: movq %rsi, 56(%rdx)
-; FALLBACK10-NEXT: movq %rcx, 8(%rdx)
-; FALLBACK10-NEXT: movq %rax, 48(%rdx)
-; FALLBACK10-NEXT: movq %r10, 32(%rdx)
-; FALLBACK10-NEXT: movq %r9, 40(%rdx)
-; FALLBACK10-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK10-NEXT: movq %r11, 24(%rdx)
-; FALLBACK10-NEXT: movq %r8, (%rdx)
-; FALLBACK10-NEXT: addq $8, %rsp
-; FALLBACK10-NEXT: popq %rbx
-; FALLBACK10-NEXT: popq %r12
-; FALLBACK10-NEXT: popq %r13
-; FALLBACK10-NEXT: popq %r14
-; FALLBACK10-NEXT: popq %r15
-; FALLBACK10-NEXT: popq %rbp
-; FALLBACK10-NEXT: vzeroupper
-; FALLBACK10-NEXT: retq
-;
-; FALLBACK11-LABEL: ashr_64bytes:
-; FALLBACK11: # %bb.0:
-; FALLBACK11-NEXT: pushq %r15
-; FALLBACK11-NEXT: pushq %r14
-; FALLBACK11-NEXT: pushq %rbx
-; FALLBACK11-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK11-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK11-NEXT: movq 48(%rdi), %rcx
-; FALLBACK11-NEXT: movq 56(%rdi), %rdi
-; FALLBACK11-NEXT: movl (%rsi), %eax
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: sarq $63, %rdi
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT: leal (,%rax,8), %ecx
-; FALLBACK11-NEXT: andl $56, %ecx
-; FALLBACK11-NEXT: andl $56, %eax
-; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK11-NEXT: movq %r9, %rsi
-; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK11-NEXT: movq %r10, %r8
-; FALLBACK11-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK11-NEXT: movq %r11, %rbx
-; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK11-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK11-NEXT: movq %rax, %r15
-; FALLBACK11-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK11-NEXT: sarxq %rcx, %r11, %r10
-; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK11-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK11-NEXT: movq %r15, 8(%rdx)
-; FALLBACK11-NEXT: movq %r9, 48(%rdx)
-; FALLBACK11-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK11-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK11-NEXT: movq %r8, 16(%rdx)
-; FALLBACK11-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK11-NEXT: movq %r14, (%rdx)
-; FALLBACK11-NEXT: movq %r10, 56(%rdx)
-; FALLBACK11-NEXT: popq %rbx
-; FALLBACK11-NEXT: popq %r14
-; FALLBACK11-NEXT: popq %r15
-; FALLBACK11-NEXT: vzeroupper
-; FALLBACK11-NEXT: retq
-;
-; FALLBACK12-LABEL: ashr_64bytes:
-; FALLBACK12: # %bb.0:
-; FALLBACK12-NEXT: pushq %rbp
-; FALLBACK12-NEXT: pushq %r15
-; FALLBACK12-NEXT: pushq %r14
-; FALLBACK12-NEXT: pushq %r13
-; FALLBACK12-NEXT: pushq %r12
-; FALLBACK12-NEXT: pushq %rbx
-; FALLBACK12-NEXT: pushq %rax
-; FALLBACK12-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK12-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK12-NEXT: movq 48(%rdi), %rax
-; FALLBACK12-NEXT: movq 56(%rdi), %rcx
-; FALLBACK12-NEXT: movl (%rsi), %edi
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: sarq $63, %rcx
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK12-NEXT: leal (,%rdi,8), %eax
-; FALLBACK12-NEXT: andl $56, %eax
-; FALLBACK12-NEXT: andl $56, %edi
-; FALLBACK12-NEXT: movq -128(%rsp,%rdi), %r10
-; FALLBACK12-NEXT: movq -120(%rsp,%rdi), %r9
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r10
-; FALLBACK12-NEXT: movl %eax, %esi
-; FALLBACK12-NEXT: notb %sil
-; FALLBACK12-NEXT: leaq (%r9,%r9), %r8
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r8
-; FALLBACK12-NEXT: orq %r10, %r8
-; FALLBACK12-NEXT: movq -104(%rsp,%rdi), %r10
-; FALLBACK12-NEXT: movq %r10, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %rbx
-; FALLBACK12-NEXT: movq -96(%rsp,%rdi), %r12
-; FALLBACK12-NEXT: leaq (%r12,%r12), %r11
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r11
-; FALLBACK12-NEXT: orq %rbx, %r11
-; FALLBACK12-NEXT: movq -112(%rsp,%rdi), %rbx
-; FALLBACK12-NEXT: movq %rbx, %r14
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r14
-; FALLBACK12-NEXT: addq %r10, %r10
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r10
-; FALLBACK12-NEXT: orq %r14, %r10
-; FALLBACK12-NEXT: movq -88(%rsp,%rdi), %r14
-; FALLBACK12-NEXT: movq %r14, %r13
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r13
-; FALLBACK12-NEXT: movq -80(%rsp,%rdi), %rbp
-; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r15
-; FALLBACK12-NEXT: orq %r13, %r15
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r12
-; FALLBACK12-NEXT: addq %r14, %r14
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r14
-; FALLBACK12-NEXT: orq %r12, %r14
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %rbp
-; FALLBACK12-NEXT: movq -72(%rsp,%rdi), %rdi
-; FALLBACK12-NEXT: leaq (%rdi,%rdi), %r12
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %r12
-; FALLBACK12-NEXT: orq %rbp, %r12
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: shrq %cl, %r9
-; FALLBACK12-NEXT: addq %rbx, %rbx
-; FALLBACK12-NEXT: movl %esi, %ecx
-; FALLBACK12-NEXT: shlq %cl, %rbx
-; FALLBACK12-NEXT: orq %r9, %rbx
-; FALLBACK12-NEXT: movl %eax, %ecx
-; FALLBACK12-NEXT: sarq %cl, %rdi
-; FALLBACK12-NEXT: movq %rdi, 56(%rdx)
-; FALLBACK12-NEXT: movq %rbx, 8(%rdx)
-; FALLBACK12-NEXT: movq %r12, 48(%rdx)
-; FALLBACK12-NEXT: movq %r14, 32(%rdx)
-; FALLBACK12-NEXT: movq %r15, 40(%rdx)
-; FALLBACK12-NEXT: movq %r10, 16(%rdx)
-; FALLBACK12-NEXT: movq %r11, 24(%rdx)
-; FALLBACK12-NEXT: movq %r8, (%rdx)
-; FALLBACK12-NEXT: addq $8, %rsp
-; FALLBACK12-NEXT: popq %rbx
-; FALLBACK12-NEXT: popq %r12
-; FALLBACK12-NEXT: popq %r13
-; FALLBACK12-NEXT: popq %r14
-; FALLBACK12-NEXT: popq %r15
-; FALLBACK12-NEXT: popq %rbp
-; FALLBACK12-NEXT: vzeroupper
-; FALLBACK12-NEXT: retq
-;
-; FALLBACK13-LABEL: ashr_64bytes:
-; FALLBACK13: # %bb.0:
-; FALLBACK13-NEXT: pushq %r15
-; FALLBACK13-NEXT: pushq %r14
-; FALLBACK13-NEXT: pushq %rbx
-; FALLBACK13-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK13-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK13-NEXT: movq 48(%rdi), %rcx
-; FALLBACK13-NEXT: movq 56(%rdi), %rdi
-; FALLBACK13-NEXT: movl (%rsi), %eax
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: sarq $63, %rdi
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK13-NEXT: leal (,%rax,8), %ecx
-; FALLBACK13-NEXT: andl $56, %ecx
-; FALLBACK13-NEXT: andl $56, %eax
-; FALLBACK13-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK13-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK13-NEXT: movq %r9, %rsi
-; FALLBACK13-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK13-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK13-NEXT: movq %r10, %r8
-; FALLBACK13-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK13-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK13-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK13-NEXT: movq %r11, %rbx
-; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK13-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK13-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK13-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK13-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK13-NEXT: movq %rax, %r15
-; FALLBACK13-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK13-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK13-NEXT: sarq %cl, %r11
-; FALLBACK13-NEXT: movq %r15, 8(%rdx)
-; FALLBACK13-NEXT: movq %r9, 48(%rdx)
-; FALLBACK13-NEXT: movq %r11, 56(%rdx)
-; FALLBACK13-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK13-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK13-NEXT: movq %r8, 16(%rdx)
-; FALLBACK13-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK13-NEXT: movq %r14, (%rdx)
-; FALLBACK13-NEXT: popq %rbx
-; FALLBACK13-NEXT: popq %r14
-; FALLBACK13-NEXT: popq %r15
-; FALLBACK13-NEXT: vzeroupper
-; FALLBACK13-NEXT: retq
-;
-; FALLBACK14-LABEL: ashr_64bytes:
-; FALLBACK14: # %bb.0:
-; FALLBACK14-NEXT: pushq %rbp
-; FALLBACK14-NEXT: pushq %r15
-; FALLBACK14-NEXT: pushq %r14
-; FALLBACK14-NEXT: pushq %r13
-; FALLBACK14-NEXT: pushq %r12
-; FALLBACK14-NEXT: pushq %rbx
-; FALLBACK14-NEXT: pushq %rax
-; FALLBACK14-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK14-NEXT: movq 48(%rdi), %rcx
-; FALLBACK14-NEXT: movq 56(%rdi), %rdi
-; FALLBACK14-NEXT: movl (%rsi), %eax
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: sarq $63, %rdi
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK14-NEXT: leal (,%rax,8), %esi
-; FALLBACK14-NEXT: andl $56, %esi
-; FALLBACK14-NEXT: andl $56, %eax
-; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11
-; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rcx
-; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi
-; FALLBACK14-NEXT: shrxq %rsi, %rdi, %r12
-; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r13
-; FALLBACK14-NEXT: shrxq %rsi, %rcx, %r9
-; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10
-; FALLBACK14-NEXT: shrxq %rsi, %r10, %r14
-; FALLBACK14-NEXT: shrxq %rsi, %r13, %r15
-; FALLBACK14-NEXT: movl %esi, %ebx
-; FALLBACK14-NEXT: notb %bl
-; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp
-; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8
-; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8
-; FALLBACK14-NEXT: orq %r11, %r8
-; FALLBACK14-NEXT: leaq (%r13,%r13), %r11
-; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11
-; FALLBACK14-NEXT: orq %r12, %r11
-; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12
-; FALLBACK14-NEXT: shrxq %rsi, %r12, %r13
-; FALLBACK14-NEXT: shrxq %rsi, %rbp, %rbp
-; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax
-; FALLBACK14-NEXT: sarxq %rsi, %rax, %rsi
-; FALLBACK14-NEXT: addq %rdi, %rdi
-; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi
-; FALLBACK14-NEXT: orq %r9, %rdi
-; FALLBACK14-NEXT: leaq (%r12,%r12), %r9
-; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9
-; FALLBACK14-NEXT: orq %r14, %r9
-; FALLBACK14-NEXT: addq %r10, %r10
-; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10
-; FALLBACK14-NEXT: orq %r15, %r10
-; FALLBACK14-NEXT: addq %rax, %rax
-; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax
-; FALLBACK14-NEXT: orq %r13, %rax
-; FALLBACK14-NEXT: addq %rcx, %rcx
-; FALLBACK14-NEXT: shlxq %rbx, %rcx, %rcx
-; FALLBACK14-NEXT: orq %rbp, %rcx
-; FALLBACK14-NEXT: movq %rsi, 56(%rdx)
-; FALLBACK14-NEXT: movq %rcx, 8(%rdx)
-; FALLBACK14-NEXT: movq %rax, 48(%rdx)
-; FALLBACK14-NEXT: movq %r10, 32(%rdx)
-; FALLBACK14-NEXT: movq %r9, 40(%rdx)
-; FALLBACK14-NEXT: movq %rdi, 16(%rdx)
-; FALLBACK14-NEXT: movq %r11, 24(%rdx)
-; FALLBACK14-NEXT: movq %r8, (%rdx)
-; FALLBACK14-NEXT: addq $8, %rsp
-; FALLBACK14-NEXT: popq %rbx
-; FALLBACK14-NEXT: popq %r12
-; FALLBACK14-NEXT: popq %r13
-; FALLBACK14-NEXT: popq %r14
-; FALLBACK14-NEXT: popq %r15
-; FALLBACK14-NEXT: popq %rbp
-; FALLBACK14-NEXT: vzeroupper
-; FALLBACK14-NEXT: retq
-;
-; FALLBACK15-LABEL: ashr_64bytes:
-; FALLBACK15: # %bb.0:
-; FALLBACK15-NEXT: pushq %r15
-; FALLBACK15-NEXT: pushq %r14
-; FALLBACK15-NEXT: pushq %rbx
-; FALLBACK15-NEXT: vmovups (%rdi), %ymm0
-; FALLBACK15-NEXT: vmovups 32(%rdi), %xmm1
-; FALLBACK15-NEXT: movq 48(%rdi), %rcx
-; FALLBACK15-NEXT: movq 56(%rdi), %rdi
-; FALLBACK15-NEXT: movl (%rsi), %eax
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: sarq $63, %rdi
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT: leal (,%rax,8), %ecx
-; FALLBACK15-NEXT: andl $56, %ecx
-; FALLBACK15-NEXT: andl $56, %eax
-; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi
-; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9
-; FALLBACK15-NEXT: movq %r9, %rsi
-; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi
-; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10
-; FALLBACK15-NEXT: movq %r10, %r8
-; FALLBACK15-NEXT: shrdq %cl, %r9, %r8
-; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9
-; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11
-; FALLBACK15-NEXT: movq %r11, %rbx
-; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx
-; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi
-; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11
-; FALLBACK15-NEXT: shrdq %cl, %r11, %r9
-; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14
-; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax
-; FALLBACK15-NEXT: movq %rax, %r15
-; FALLBACK15-NEXT: shrdq %cl, %r10, %r15
-; FALLBACK15-NEXT: sarxq %rcx, %r11, %r10
-; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx
-; FALLBACK15-NEXT: shrdq %cl, %rax, %r14
-; FALLBACK15-NEXT: movq %r15, 8(%rdx)
-; FALLBACK15-NEXT: movq %r9, 48(%rdx)
-; FALLBACK15-NEXT: movq %rdi, 32(%rdx)
-; FALLBACK15-NEXT: movq %rbx, 40(%rdx)
-; FALLBACK15-NEXT: movq %r8, 16(%rdx)
-; FALLBACK15-NEXT: movq %rsi, 24(%rdx)
-; FALLBACK15-NEXT: movq %r14, (%rdx)
-; FALLBACK15-NEXT: movq %r10, 56(%rdx)
-; FALLBACK15-NEXT: popq %rbx
-; FALLBACK15-NEXT: popq %r14
-; FALLBACK15-NEXT: popq %r15
-; FALLBACK15-NEXT: vzeroupper
-; FALLBACK15-NEXT: retq
-;
-; FALLBACK16-LABEL: ashr_64bytes:
-; FALLBACK16: # %bb.0:
-; FALLBACK16-NEXT: pushl %ebp
-; FALLBACK16-NEXT: pushl %ebx
-; FALLBACK16-NEXT: pushl %edi
-; FALLBACK16-NEXT: pushl %esi
-; FALLBACK16-NEXT: subl $204, %esp
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT: movl (%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 4(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 8(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 12(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 16(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 20(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 24(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 28(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 32(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 36(%ecx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 40(%ecx), %ebx
-; FALLBACK16-NEXT: movl 44(%ecx), %edi
-; FALLBACK16-NEXT: movl 48(%ecx), %esi
-; FALLBACK16-NEXT: movl 52(%ecx), %edx
-; FALLBACK16-NEXT: movl 56(%ecx), %eax
-; FALLBACK16-NEXT: movl 60(%ecx), %ecx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK16-NEXT: movl (%ebp), %ebp
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: sarl $31, %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT: movl %ebp, %ecx
-; FALLBACK16-NEXT: movl %ebp, %esi
-; FALLBACK16-NEXT: andl $60, %esi
-; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK16-NEXT: shll $3, %ecx
-; FALLBACK16-NEXT: andl $24, %ecx
-; FALLBACK16-NEXT: movl %edx, %eax
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 72(%esp,%esi), %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: addl %edi, %edi
-; FALLBACK16-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT: movl %ecx, %ebx
-; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK16-NEXT: notb %ch
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: orl %eax, %edi
-; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 64(%esp,%esi), %eax
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: addl %edx, %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: orl %eax, %edx
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 76(%esp,%esi), %ebp
-; FALLBACK16-NEXT: movl %ebp, %edx
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK16-NEXT: leal (%edi,%edi), %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %edx, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: addl %ebp, %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %esi, %edx
-; FALLBACK16-NEXT: movl 84(%esp,%esi), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 88(%esp,%esi), %esi
-; FALLBACK16-NEXT: leal (%esi,%esi), %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT: addl %ebx, %ebx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %edi, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %edx, %eax
-; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl 92(%esp,%edx), %ebp
-; FALLBACK16-NEXT: movl %ebp, %edx
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edx
-; FALLBACK16-NEXT: movl 96(%esp,%eax), %edi
-; FALLBACK16-NEXT: leal (%edi,%edi), %eax
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %eax
-; FALLBACK16-NEXT: orl %edx, %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: addl %ebp, %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %esi, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: movl 100(%esp,%edx), %eax
-; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 104(%esp,%edx), %esi
-; FALLBACK16-NEXT: leal (%esi,%esi), %ebp
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %ebx, %edx
-; FALLBACK16-NEXT: movb %dl, %cl
-; FALLBACK16-NEXT: shrl %cl, %edi
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT: addl %ebx, %ebx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebx
-; FALLBACK16-NEXT: orl %edi, %ebx
-; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK16-NEXT: movl 108(%esp,%ebp), %edi
-; FALLBACK16-NEXT: movl %edi, %eax
-; FALLBACK16-NEXT: movl %edx, %ebx
-; FALLBACK16-NEXT: movl %ebx, %ecx
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 112(%esp,%ebp), %ecx
-; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movl %ebp, %edx
-; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp
-; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: shrl %cl, %esi
-; FALLBACK16-NEXT: addl %edi, %edi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edi
-; FALLBACK16-NEXT: orl %esi, %edi
-; FALLBACK16-NEXT: movl 116(%esp,%edx), %esi
-; FALLBACK16-NEXT: movl %esi, %eax
-; FALLBACK16-NEXT: movl %ebx, %ecx
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl 120(%esp,%edx), %edx
-; FALLBACK16-NEXT: leal (%edx,%edx), %ebp
-; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %ebp
-; FALLBACK16-NEXT: orl %eax, %ebp
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: addl %esi, %esi
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %esi
-; FALLBACK16-NEXT: orl %eax, %esi
-; FALLBACK16-NEXT: movb %bl, %cl
-; FALLBACK16-NEXT: movl %edx, %eax
-; FALLBACK16-NEXT: shrl %cl, %eax
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK16-NEXT: movb %ch, %cl
-; FALLBACK16-NEXT: shll %cl, %edx
-; FALLBACK16-NEXT: orl %eax, %edx
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK16-NEXT: sarl %cl, %ebx
-; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT: movl %ebx, 60(%eax)
-; FALLBACK16-NEXT: movl %edx, 56(%eax)
-; FALLBACK16-NEXT: movl %esi, 48(%eax)
-; FALLBACK16-NEXT: movl %ebp, 52(%eax)
-; FALLBACK16-NEXT: movl %edi, 40(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 44(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 32(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 36(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 24(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 28(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 16(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 20(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 8(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 12(%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, (%eax)
-; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT: movl %ecx, 4(%eax)
-; FALLBACK16-NEXT: addl $204, %esp
-; FALLBACK16-NEXT: popl %esi
-; FALLBACK16-NEXT: popl %edi
-; FALLBACK16-NEXT: popl %ebx
-; FALLBACK16-NEXT: popl %ebp
-; FALLBACK16-NEXT: retl
-;
-; FALLBACK17-LABEL: ashr_64bytes:
-; FALLBACK17: # %bb.0:
-; FALLBACK17-NEXT: pushl %ebp
-; FALLBACK17-NEXT: pushl %ebx
-; FALLBACK17-NEXT: pushl %edi
-; FALLBACK17-NEXT: pushl %esi
-; FALLBACK17-NEXT: subl $188, %esp
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK17-NEXT: movl (%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 4(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 8(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 12(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 16(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 20(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 24(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 28(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 32(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 36(%eax), %ecx
-; FALLBACK17-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 40(%eax), %ebp
-; FALLBACK17-NEXT: movl 44(%eax), %ebx
-; FALLBACK17-NEXT: movl 48(%eax), %edi
-; FALLBACK17-NEXT: movl 52(%eax), %esi
-; FALLBACK17-NEXT: movl 56(%eax), %edx
-; FALLBACK17-NEXT: movl 60(%eax), %eax
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK17-NEXT: movl (%ecx), %ecx
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: sarl $31, %eax
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT: movl %ecx, %ebp
-; FALLBACK17-NEXT: andl $60, %ebp
-; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shll $3, %ecx
-; FALLBACK17-NEXT: andl $24, %ecx
-; FALLBACK17-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %esi
-; FALLBACK17-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edx
-; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edx
-; FALLBACK17-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edx
-; FALLBACK17-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl %esi, %edx
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edi
-; FALLBACK17-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK17-NEXT: movl %eax, %edi
-; FALLBACK17-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK17-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK17-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK17-NEXT: movl %edx, 56(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK17-NEXT: sarl %cl, %eax
-; FALLBACK17-NEXT: movl %eax, 60(%ebp)
-; FALLBACK17-NEXT: movl %esi, 48(%ebp)
-; FALLBACK17-NEXT: movl %edi, 52(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 40(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 44(%ebp)
-; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 32(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 36(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 24(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 28(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 16(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 20(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 8(%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 12(%ebp)
-; FALLBACK17-NEXT: movl %ebx, (%ebp)
-; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT: movl %eax, 4(%ebp)
-; FALLBACK17-NEXT: addl $188, %esp
-; FALLBACK17-NEXT: popl %esi
-; FALLBACK17-NEXT: popl %edi
-; FALLBACK17-NEXT: popl %ebx
-; FALLBACK17-NEXT: popl %ebp
-; FALLBACK17-NEXT: retl
-;
-; FALLBACK18-LABEL: ashr_64bytes:
-; FALLBACK18: # %bb.0:
-; FALLBACK18-NEXT: pushl %ebp
-; FALLBACK18-NEXT: pushl %ebx
-; FALLBACK18-NEXT: pushl %edi
-; FALLBACK18-NEXT: pushl %esi
-; FALLBACK18-NEXT: subl $204, %esp
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl (%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 4(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 8(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 12(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 16(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 20(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 24(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 28(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 32(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 36(%eax), %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 40(%eax), %ebp
-; FALLBACK18-NEXT: movl 44(%eax), %ebx
-; FALLBACK18-NEXT: movl 48(%eax), %edi
-; FALLBACK18-NEXT: movl 52(%eax), %esi
-; FALLBACK18-NEXT: movl 56(%eax), %edx
-; FALLBACK18-NEXT: movl 60(%eax), %ecx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl (%eax), %eax
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: sarl $31, %ecx
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK18-NEXT: movl %eax, %ecx
-; FALLBACK18-NEXT: leal (,%eax,8), %edx
-; FALLBACK18-NEXT: andl $24, %edx
-; FALLBACK18-NEXT: andl $60, %ecx
-; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl %edx, %ebx
-; FALLBACK18-NEXT: notb %bl
-; FALLBACK18-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax
-; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK18-NEXT: orl %edi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK18-NEXT: leal (%esi,%esi), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK18-NEXT: orl %eax, %edi
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: orl %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK18-NEXT: leal (%esi,%esi), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK18-NEXT: orl %eax, %edi
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK18-NEXT: addl %edi, %edi
-; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK18-NEXT: orl %esi, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: leal (%eax,%eax), %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi
-; FALLBACK18-NEXT: movl %ecx, %edi
-; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK18-NEXT: orl %ebp, %eax
-; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK18-NEXT: addl %esi, %esi
-; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK18-NEXT: orl %ecx, %esi
-; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp
-; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx
-; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax
-; FALLBACK18-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK18-NEXT: orl %edi, %ecx
-; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK18-NEXT: addl %eax, %eax
-; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp
-; FALLBACK18-NEXT: sarxl %edx, %ebp, %edx
-; FALLBACK18-NEXT: addl %ebp, %ebp
-; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx
-; FALLBACK18-NEXT: orl %eax, %ebx
-; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK18-NEXT: movl %edx, 60(%eax)
-; FALLBACK18-NEXT: movl %ebx, 56(%eax)
-; FALLBACK18-NEXT: movl %edi, 48(%eax)
-; FALLBACK18-NEXT: movl %ecx, 52(%eax)
-; FALLBACK18-NEXT: movl %esi, 40(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 44(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 32(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 36(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 24(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 28(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 16(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 20(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 8(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 12(%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, (%eax)
-; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK18-NEXT: movl %ecx, 4(%eax)
-; FALLBACK18-NEXT: addl $204, %esp
-; FALLBACK18-NEXT: popl %esi
-; FALLBACK18-NEXT: popl %edi
-; FALLBACK18-NEXT: popl %ebx
-; FALLBACK18-NEXT: popl %ebp
-; FALLBACK18-NEXT: retl
-;
-; FALLBACK19-LABEL: ashr_64bytes:
-; FALLBACK19: # %bb.0:
-; FALLBACK19-NEXT: pushl %ebp
-; FALLBACK19-NEXT: pushl %ebx
-; FALLBACK19-NEXT: pushl %edi
-; FALLBACK19-NEXT: pushl %esi
-; FALLBACK19-NEXT: subl $188, %esp
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK19-NEXT: movl (%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 4(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 8(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 12(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 16(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 20(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 24(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 28(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 32(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 36(%eax), %ecx
-; FALLBACK19-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 40(%eax), %ebp
-; FALLBACK19-NEXT: movl 44(%eax), %ebx
-; FALLBACK19-NEXT: movl 48(%eax), %edi
-; FALLBACK19-NEXT: movl 52(%eax), %esi
-; FALLBACK19-NEXT: movl 56(%eax), %edx
-; FALLBACK19-NEXT: movl 60(%eax), %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK19-NEXT: movl (%ecx), %ecx
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: sarl $31, %eax
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK19-NEXT: movl %ecx, %ebp
-; FALLBACK19-NEXT: andl $60, %ebp
-; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shll $3, %ecx
-; FALLBACK19-NEXT: andl $24, %ecx
-; FALLBACK19-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %esi
-; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill
-; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl %edi, %edx
-; FALLBACK19-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK19-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK19-NEXT: movl %eax, 56(%ebp)
-; FALLBACK19-NEXT: movl %esi, 48(%ebp)
-; FALLBACK19-NEXT: movl %edx, 52(%ebp)
-; FALLBACK19-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 44(%ebp)
-; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 32(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 36(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 24(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 28(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 16(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 20(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 8(%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK19-NEXT: movl %eax, 12(%ebp)
-; FALLBACK19-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK19-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK19-NEXT: movl %edi, (%ebp)
-; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK19-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK19-NEXT: movl %eax, 60(%ebp)
-; FALLBACK19-NEXT: addl $188, %esp
-; FALLBACK19-NEXT: popl %esi
-; FALLBACK19-NEXT: popl %edi
-; FALLBACK19-NEXT: popl %ebx
-; FALLBACK19-NEXT: popl %ebp
-; FALLBACK19-NEXT: retl
-;
-; FALLBACK20-LABEL: ashr_64bytes:
-; FALLBACK20: # %bb.0:
-; FALLBACK20-NEXT: pushl %ebp
-; FALLBACK20-NEXT: pushl %ebx
-; FALLBACK20-NEXT: pushl %edi
-; FALLBACK20-NEXT: pushl %esi
-; FALLBACK20-NEXT: subl $204, %esp
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT: movups (%ecx), %xmm0
-; FALLBACK20-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK20-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK20-NEXT: movl 48(%ecx), %edx
-; FALLBACK20-NEXT: movl 52(%ecx), %esi
-; FALLBACK20-NEXT: movl 56(%ecx), %edi
-; FALLBACK20-NEXT: movl 60(%ecx), %ecx
-; FALLBACK20-NEXT: movl (%eax), %eax
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: sarl $31, %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT: movl %eax, %esi
-; FALLBACK20-NEXT: andl $60, %esi
-; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK20-NEXT: shll $3, %eax
-; FALLBACK20-NEXT: andl $24, %eax
-; FALLBACK20-NEXT: movl %edx, %edi
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT: movb %al, %ch
-; FALLBACK20-NEXT: notb %ch
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %edi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %edx, %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: orl %edi, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx
-; FALLBACK20-NEXT: movl %edx, %ebp
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK20-NEXT: leal (%edi,%edi), %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %ebp, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: addl %edx, %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: orl %ebx, %edx
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: movl %eax, %edx
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: addl %eax, %eax
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: orl %ebp, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %edi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi
-; FALLBACK20-NEXT: leal (%edi,%edi), %eax
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: orl %ebp, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %eax, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx
-; FALLBACK20-NEXT: movl %ebx, %ebp
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx
-; FALLBACK20-NEXT: leal (%edx,%edx), %eax
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %eax
-; FALLBACK20-NEXT: orl %ebp, %eax
-; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %edi
-; FALLBACK20-NEXT: addl %ebx, %ebx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %edi, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi
-; FALLBACK20-NEXT: movl %edi, %ebp
-; FALLBACK20-NEXT: movl %eax, %ecx
-; FALLBACK20-NEXT: shrl %cl, %ebp
-; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx
-; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebx
-; FALLBACK20-NEXT: orl %ebp, %ebx
-; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %edx
-; FALLBACK20-NEXT: addl %edi, %edi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edi
-; FALLBACK20-NEXT: orl %edx, %edi
-; FALLBACK20-NEXT: movl %esi, %edx
-; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi
-; FALLBACK20-NEXT: movl %esi, %ebx
-; FALLBACK20-NEXT: movb %al, %cl
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax
-; FALLBACK20-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %ebp
-; FALLBACK20-NEXT: orl %ebx, %ebp
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT: shrl %cl, %ebx
-; FALLBACK20-NEXT: addl %esi, %esi
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %esi
-; FALLBACK20-NEXT: orl %ebx, %esi
-; FALLBACK20-NEXT: movb %dl, %cl
-; FALLBACK20-NEXT: shrl %cl, %eax
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK20-NEXT: movb %ch, %cl
-; FALLBACK20-NEXT: shll %cl, %edx
-; FALLBACK20-NEXT: orl %eax, %edx
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK20-NEXT: sarl %cl, %ebx
-; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK20-NEXT: movl %ebx, 60(%eax)
-; FALLBACK20-NEXT: movl %edx, 56(%eax)
-; FALLBACK20-NEXT: movl %esi, 48(%eax)
-; FALLBACK20-NEXT: movl %ebp, 52(%eax)
-; FALLBACK20-NEXT: movl %edi, 40(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 44(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 32(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 36(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 24(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 28(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 16(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 20(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 8(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 12(%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, (%eax)
-; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT: movl %ecx, 4(%eax)
-; FALLBACK20-NEXT: addl $204, %esp
-; FALLBACK20-NEXT: popl %esi
-; FALLBACK20-NEXT: popl %edi
-; FALLBACK20-NEXT: popl %ebx
-; FALLBACK20-NEXT: popl %ebp
-; FALLBACK20-NEXT: retl
-;
-; FALLBACK21-LABEL: ashr_64bytes:
-; FALLBACK21: # %bb.0:
-; FALLBACK21-NEXT: pushl %ebp
-; FALLBACK21-NEXT: pushl %ebx
-; FALLBACK21-NEXT: pushl %edi
-; FALLBACK21-NEXT: pushl %esi
-; FALLBACK21-NEXT: subl $188, %esp
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK21-NEXT: movups (%eax), %xmm0
-; FALLBACK21-NEXT: movups 16(%eax), %xmm1
-; FALLBACK21-NEXT: movups 32(%eax), %xmm2
-; FALLBACK21-NEXT: movl 48(%eax), %edx
-; FALLBACK21-NEXT: movl 52(%eax), %esi
-; FALLBACK21-NEXT: movl 56(%eax), %edi
-; FALLBACK21-NEXT: movl 60(%eax), %eax
-; FALLBACK21-NEXT: movl (%ecx), %ecx
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: sarl $31, %eax
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT: movl %ecx, %ebp
-; FALLBACK21-NEXT: andl $60, %ebp
-; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shll $3, %ecx
-; FALLBACK21-NEXT: andl $24, %ecx
-; FALLBACK21-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %esi
-; FALLBACK21-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edx
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl %esi, %edx
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edi
-; FALLBACK21-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK21-NEXT: movl %eax, %edi
-; FALLBACK21-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK21-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK21-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK21-NEXT: movl %edx, 56(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK21-NEXT: sarl %cl, %eax
-; FALLBACK21-NEXT: movl %eax, 60(%ebp)
-; FALLBACK21-NEXT: movl %esi, 48(%ebp)
-; FALLBACK21-NEXT: movl %edi, 52(%ebp)
-; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 40(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 44(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 32(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 36(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 24(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 28(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 16(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 20(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 8(%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 12(%ebp)
-; FALLBACK21-NEXT: movl %ebx, (%ebp)
-; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT: movl %eax, 4(%ebp)
-; FALLBACK21-NEXT: addl $188, %esp
-; FALLBACK21-NEXT: popl %esi
-; FALLBACK21-NEXT: popl %edi
-; FALLBACK21-NEXT: popl %ebx
-; FALLBACK21-NEXT: popl %ebp
-; FALLBACK21-NEXT: retl
-;
-; FALLBACK22-LABEL: ashr_64bytes:
-; FALLBACK22: # %bb.0:
-; FALLBACK22-NEXT: pushl %ebp
-; FALLBACK22-NEXT: pushl %ebx
-; FALLBACK22-NEXT: pushl %edi
-; FALLBACK22-NEXT: pushl %esi
-; FALLBACK22-NEXT: subl $204, %esp
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK22-NEXT: movups (%ecx), %xmm0
-; FALLBACK22-NEXT: movups 16(%ecx), %xmm1
-; FALLBACK22-NEXT: movups 32(%ecx), %xmm2
-; FALLBACK22-NEXT: movl 48(%ecx), %edx
-; FALLBACK22-NEXT: movl 52(%ecx), %esi
-; FALLBACK22-NEXT: movl 56(%ecx), %edi
-; FALLBACK22-NEXT: movl 60(%ecx), %ecx
-; FALLBACK22-NEXT: movl (%eax), %eax
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: sarl $31, %ecx
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT: movl %eax, %ecx
-; FALLBACK22-NEXT: leal (,%eax,8), %edx
-; FALLBACK22-NEXT: andl $24, %edx
-; FALLBACK22-NEXT: andl $60, %ecx
-; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi
-; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl %edx, %ebx
-; FALLBACK22-NEXT: notb %bl
-; FALLBACK22-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax
-; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK22-NEXT: orl %edi, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK22-NEXT: leal (%esi,%esi), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK22-NEXT: orl %eax, %edi
-; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: leal (%eax,%eax), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: orl %esi, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK22-NEXT: leal (%esi,%esi), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK22-NEXT: orl %eax, %edi
-; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: leal (%eax,%eax), %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK22-NEXT: addl %edi, %edi
-; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT: orl %esi, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: leal (%eax,%eax), %esi
-; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi
-; FALLBACK22-NEXT: movl %ecx, %edi
-; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK22-NEXT: orl %ebp, %eax
-; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK22-NEXT: addl %esi, %esi
-; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK22-NEXT: orl %ecx, %esi
-; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp
-; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax
-; FALLBACK22-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK22-NEXT: orl %edi, %ecx
-; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT: addl %eax, %eax
-; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp
-; FALLBACK22-NEXT: sarxl %edx, %ebp, %edx
-; FALLBACK22-NEXT: addl %ebp, %ebp
-; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx
-; FALLBACK22-NEXT: orl %eax, %ebx
-; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT: movl %edx, 60(%eax)
-; FALLBACK22-NEXT: movl %ebx, 56(%eax)
-; FALLBACK22-NEXT: movl %edi, 48(%eax)
-; FALLBACK22-NEXT: movl %ecx, 52(%eax)
-; FALLBACK22-NEXT: movl %esi, 40(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 44(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 32(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 36(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 24(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 28(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 16(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 20(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 8(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 12(%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, (%eax)
-; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK22-NEXT: movl %ecx, 4(%eax)
-; FALLBACK22-NEXT: addl $204, %esp
-; FALLBACK22-NEXT: popl %esi
-; FALLBACK22-NEXT: popl %edi
-; FALLBACK22-NEXT: popl %ebx
-; FALLBACK22-NEXT: popl %ebp
-; FALLBACK22-NEXT: retl
-;
-; FALLBACK23-LABEL: ashr_64bytes:
-; FALLBACK23: # %bb.0:
-; FALLBACK23-NEXT: pushl %ebp
-; FALLBACK23-NEXT: pushl %ebx
-; FALLBACK23-NEXT: pushl %edi
-; FALLBACK23-NEXT: pushl %esi
-; FALLBACK23-NEXT: subl $188, %esp
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT: movups (%eax), %xmm0
-; FALLBACK23-NEXT: movups 16(%eax), %xmm1
-; FALLBACK23-NEXT: movups 32(%eax), %xmm2
-; FALLBACK23-NEXT: movl 48(%eax), %edx
-; FALLBACK23-NEXT: movl 52(%eax), %esi
-; FALLBACK23-NEXT: movl 56(%eax), %edi
-; FALLBACK23-NEXT: movl 60(%eax), %eax
-; FALLBACK23-NEXT: movl (%ecx), %ecx
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: sarl $31, %eax
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT: movl %ecx, %ebp
-; FALLBACK23-NEXT: andl $60, %ebp
-; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shll $3, %ecx
-; FALLBACK23-NEXT: andl $24, %ecx
-; FALLBACK23-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %esi
-; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl %edi, %edx
-; FALLBACK23-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK23-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK23-NEXT: movl %eax, 56(%ebp)
-; FALLBACK23-NEXT: movl %esi, 48(%ebp)
-; FALLBACK23-NEXT: movl %edx, 52(%ebp)
-; FALLBACK23-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 44(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 32(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 36(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 24(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 28(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 16(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 20(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 8(%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK23-NEXT: movl %eax, 12(%ebp)
-; FALLBACK23-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK23-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK23-NEXT: movl %edi, (%ebp)
-; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK23-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK23-NEXT: movl %eax, 60(%ebp)
-; FALLBACK23-NEXT: addl $188, %esp
-; FALLBACK23-NEXT: popl %esi
-; FALLBACK23-NEXT: popl %edi
-; FALLBACK23-NEXT: popl %ebx
-; FALLBACK23-NEXT: popl %ebp
-; FALLBACK23-NEXT: retl
-;
-; FALLBACK24-LABEL: ashr_64bytes:
-; FALLBACK24: # %bb.0:
-; FALLBACK24-NEXT: pushl %ebp
-; FALLBACK24-NEXT: pushl %ebx
-; FALLBACK24-NEXT: pushl %edi
-; FALLBACK24-NEXT: pushl %esi
-; FALLBACK24-NEXT: subl $204, %esp
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT: vmovups 32(%ecx), %xmm1
-; FALLBACK24-NEXT: movl 48(%ecx), %edx
-; FALLBACK24-NEXT: movl 52(%ecx), %esi
-; FALLBACK24-NEXT: movl 56(%ecx), %edi
-; FALLBACK24-NEXT: movl 60(%ecx), %ecx
-; FALLBACK24-NEXT: movl (%eax), %eax
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: sarl $31, %ecx
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT: movl %eax, %esi
-; FALLBACK24-NEXT: andl $60, %esi
-; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK24-NEXT: shll $3, %eax
-; FALLBACK24-NEXT: andl $24, %eax
-; FALLBACK24-NEXT: movl %edx, %edi
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: movl 72(%esp,%esi), %ecx
-; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT: movb %al, %ch
-; FALLBACK24-NEXT: notb %ch
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %edi, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: addl %edx, %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: orl %edi, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx
-; FALLBACK24-NEXT: movl %edx, %ebp
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK24-NEXT: leal (%edi,%edi), %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %ebp, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: addl %edx, %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: orl %ebx, %edx
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %ebp
-; FALLBACK24-NEXT: movl %eax, %edx
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: addl %eax, %eax
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %eax
-; FALLBACK24-NEXT: orl %ebp, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %edi, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %ebp
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi
-; FALLBACK24-NEXT: leal (%edi,%edi), %eax
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %eax
-; FALLBACK24-NEXT: orl %ebp, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %eax, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx
-; FALLBACK24-NEXT: movl %ebx, %ebp
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx
-; FALLBACK24-NEXT: leal (%edx,%edx), %eax
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %eax
-; FALLBACK24-NEXT: orl %ebp, %eax
-; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %edi
-; FALLBACK24-NEXT: addl %ebx, %ebx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %edi, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi
-; FALLBACK24-NEXT: movl %edi, %ebp
-; FALLBACK24-NEXT: movl %eax, %ecx
-; FALLBACK24-NEXT: shrl %cl, %ebp
-; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx
-; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebx
-; FALLBACK24-NEXT: orl %ebp, %ebx
-; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %edx
-; FALLBACK24-NEXT: addl %edi, %edi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edi
-; FALLBACK24-NEXT: orl %edx, %edi
-; FALLBACK24-NEXT: movl %esi, %edx
-; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi
-; FALLBACK24-NEXT: movl %esi, %ebx
-; FALLBACK24-NEXT: movb %al, %cl
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax
-; FALLBACK24-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %ebp
-; FALLBACK24-NEXT: orl %ebx, %ebp
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT: shrl %cl, %ebx
-; FALLBACK24-NEXT: addl %esi, %esi
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %esi
-; FALLBACK24-NEXT: orl %ebx, %esi
-; FALLBACK24-NEXT: movb %dl, %cl
-; FALLBACK24-NEXT: shrl %cl, %eax
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK24-NEXT: movb %ch, %cl
-; FALLBACK24-NEXT: shll %cl, %edx
-; FALLBACK24-NEXT: orl %eax, %edx
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK24-NEXT: sarl %cl, %ebx
-; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK24-NEXT: movl %ebx, 60(%eax)
-; FALLBACK24-NEXT: movl %edx, 56(%eax)
-; FALLBACK24-NEXT: movl %esi, 48(%eax)
-; FALLBACK24-NEXT: movl %ebp, 52(%eax)
-; FALLBACK24-NEXT: movl %edi, 40(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 44(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 32(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 36(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 24(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 28(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 16(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 20(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 8(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 12(%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, (%eax)
-; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT: movl %ecx, 4(%eax)
-; FALLBACK24-NEXT: addl $204, %esp
-; FALLBACK24-NEXT: popl %esi
-; FALLBACK24-NEXT: popl %edi
-; FALLBACK24-NEXT: popl %ebx
-; FALLBACK24-NEXT: popl %ebp
-; FALLBACK24-NEXT: vzeroupper
-; FALLBACK24-NEXT: retl
-;
-; FALLBACK25-LABEL: ashr_64bytes:
-; FALLBACK25: # %bb.0:
-; FALLBACK25-NEXT: pushl %ebp
-; FALLBACK25-NEXT: pushl %ebx
-; FALLBACK25-NEXT: pushl %edi
-; FALLBACK25-NEXT: pushl %esi
-; FALLBACK25-NEXT: subl $188, %esp
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK25-NEXT: vmovups (%eax), %ymm0
-; FALLBACK25-NEXT: vmovups 32(%eax), %xmm1
-; FALLBACK25-NEXT: movl 48(%eax), %edx
-; FALLBACK25-NEXT: movl 52(%eax), %esi
-; FALLBACK25-NEXT: movl 56(%eax), %edi
-; FALLBACK25-NEXT: movl 60(%eax), %eax
-; FALLBACK25-NEXT: movl (%ecx), %ecx
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: sarl $31, %eax
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT: movl %ecx, %ebp
-; FALLBACK25-NEXT: andl $60, %ebp
-; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shll $3, %ecx
-; FALLBACK25-NEXT: andl $24, %ecx
-; FALLBACK25-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %esi
-; FALLBACK25-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edx
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl %esi, %edx
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edi
-; FALLBACK25-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK25-NEXT: movl %eax, %edi
-; FALLBACK25-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK25-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK25-NEXT: movl %edx, 56(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK25-NEXT: sarl %cl, %eax
-; FALLBACK25-NEXT: movl %eax, 60(%ebp)
-; FALLBACK25-NEXT: movl %esi, 48(%ebp)
-; FALLBACK25-NEXT: movl %edi, 52(%ebp)
-; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 40(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 44(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 32(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 36(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 24(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 28(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 16(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 20(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 8(%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 12(%ebp)
-; FALLBACK25-NEXT: movl %ebx, (%ebp)
-; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT: movl %eax, 4(%ebp)
-; FALLBACK25-NEXT: addl $188, %esp
-; FALLBACK25-NEXT: popl %esi
-; FALLBACK25-NEXT: popl %edi
-; FALLBACK25-NEXT: popl %ebx
-; FALLBACK25-NEXT: popl %ebp
-; FALLBACK25-NEXT: vzeroupper
-; FALLBACK25-NEXT: retl
-;
-; FALLBACK26-LABEL: ashr_64bytes:
-; FALLBACK26: # %bb.0:
-; FALLBACK26-NEXT: pushl %ebp
-; FALLBACK26-NEXT: pushl %ebx
-; FALLBACK26-NEXT: pushl %edi
-; FALLBACK26-NEXT: pushl %esi
-; FALLBACK26-NEXT: subl $204, %esp
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK26-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK26-NEXT: vmovups 32(%ecx), %xmm1
-; FALLBACK26-NEXT: movl 48(%ecx), %edx
-; FALLBACK26-NEXT: movl 52(%ecx), %esi
-; FALLBACK26-NEXT: movl 56(%ecx), %edi
-; FALLBACK26-NEXT: movl 60(%ecx), %ecx
-; FALLBACK26-NEXT: movl (%eax), %eax
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: sarl $31, %ecx
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK26-NEXT: movl %eax, %ecx
-; FALLBACK26-NEXT: leal (,%eax,8), %edx
-; FALLBACK26-NEXT: andl $24, %edx
-; FALLBACK26-NEXT: andl $60, %ecx
-; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi
-; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %esi, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl %edx, %ebx
-; FALLBACK26-NEXT: notb %bl
-; FALLBACK26-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax
-; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK26-NEXT: addl %esi, %esi
-; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK26-NEXT: orl %edi, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK26-NEXT: leal (%esi,%esi), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK26-NEXT: orl %eax, %edi
-; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: leal (%eax,%eax), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: orl %esi, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK26-NEXT: leal (%esi,%esi), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK26-NEXT: orl %eax, %edi
-; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: leal (%eax,%eax), %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK26-NEXT: addl %edi, %edi
-; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK26-NEXT: orl %esi, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: leal (%eax,%eax), %esi
-; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi
-; FALLBACK26-NEXT: movl %ecx, %edi
-; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK26-NEXT: orl %ebp, %eax
-; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK26-NEXT: addl %esi, %esi
-; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK26-NEXT: orl %ecx, %esi
-; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp
-; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx
-; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax
-; FALLBACK26-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK26-NEXT: orl %edi, %ecx
-; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK26-NEXT: addl %eax, %eax
-; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp
-; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx
-; FALLBACK26-NEXT: addl %ebp, %ebp
-; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx
-; FALLBACK26-NEXT: orl %eax, %ebx
-; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK26-NEXT: movl %edx, 60(%eax)
-; FALLBACK26-NEXT: movl %ebx, 56(%eax)
-; FALLBACK26-NEXT: movl %edi, 48(%eax)
-; FALLBACK26-NEXT: movl %ecx, 52(%eax)
-; FALLBACK26-NEXT: movl %esi, 40(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 44(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 32(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 36(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 24(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 28(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 16(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 20(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 8(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 12(%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, (%eax)
-; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK26-NEXT: movl %ecx, 4(%eax)
-; FALLBACK26-NEXT: addl $204, %esp
-; FALLBACK26-NEXT: popl %esi
-; FALLBACK26-NEXT: popl %edi
-; FALLBACK26-NEXT: popl %ebx
-; FALLBACK26-NEXT: popl %ebp
-; FALLBACK26-NEXT: vzeroupper
-; FALLBACK26-NEXT: retl
-;
-; FALLBACK27-LABEL: ashr_64bytes:
-; FALLBACK27: # %bb.0:
-; FALLBACK27-NEXT: pushl %ebp
-; FALLBACK27-NEXT: pushl %ebx
-; FALLBACK27-NEXT: pushl %edi
-; FALLBACK27-NEXT: pushl %esi
-; FALLBACK27-NEXT: subl $188, %esp
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK27-NEXT: vmovups (%eax), %ymm0
-; FALLBACK27-NEXT: vmovups 32(%eax), %xmm1
-; FALLBACK27-NEXT: movl 48(%eax), %edx
-; FALLBACK27-NEXT: movl 52(%eax), %esi
-; FALLBACK27-NEXT: movl 56(%eax), %edi
-; FALLBACK27-NEXT: movl 60(%eax), %eax
-; FALLBACK27-NEXT: movl (%ecx), %ecx
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: sarl $31, %eax
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK27-NEXT: movl %ecx, %ebp
-; FALLBACK27-NEXT: andl $60, %ebp
-; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shll $3, %ecx
-; FALLBACK27-NEXT: andl $24, %ecx
-; FALLBACK27-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %esi
-; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl %edi, %edx
-; FALLBACK27-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK27-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK27-NEXT: movl %eax, 56(%ebp)
-; FALLBACK27-NEXT: movl %esi, 48(%ebp)
-; FALLBACK27-NEXT: movl %edx, 52(%ebp)
-; FALLBACK27-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 44(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 32(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 36(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 24(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 28(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 16(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 20(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 8(%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK27-NEXT: movl %eax, 12(%ebp)
-; FALLBACK27-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK27-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK27-NEXT: movl %edi, (%ebp)
-; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK27-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK27-NEXT: movl %eax, 60(%ebp)
-; FALLBACK27-NEXT: addl $188, %esp
-; FALLBACK27-NEXT: popl %esi
-; FALLBACK27-NEXT: popl %edi
-; FALLBACK27-NEXT: popl %ebx
-; FALLBACK27-NEXT: popl %ebp
-; FALLBACK27-NEXT: vzeroupper
-; FALLBACK27-NEXT: retl
-;
-; FALLBACK28-LABEL: ashr_64bytes:
-; FALLBACK28: # %bb.0:
-; FALLBACK28-NEXT: pushl %ebp
-; FALLBACK28-NEXT: pushl %ebx
-; FALLBACK28-NEXT: pushl %edi
-; FALLBACK28-NEXT: pushl %esi
-; FALLBACK28-NEXT: subl $204, %esp
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK28-NEXT: vmovups 32(%ecx), %xmm1
-; FALLBACK28-NEXT: movl 48(%ecx), %edx
-; FALLBACK28-NEXT: movl 52(%ecx), %esi
-; FALLBACK28-NEXT: movl 56(%ecx), %edi
-; FALLBACK28-NEXT: movl 60(%ecx), %ecx
-; FALLBACK28-NEXT: movl (%eax), %eax
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: sarl $31, %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT: movl %eax, %esi
-; FALLBACK28-NEXT: andl $60, %esi
-; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx
-; FALLBACK28-NEXT: shll $3, %eax
-; FALLBACK28-NEXT: andl $24, %eax
-; FALLBACK28-NEXT: movl %edx, %edi
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: movl 72(%esp,%esi), %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT: movb %al, %ch
-; FALLBACK28-NEXT: notb %ch
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %edi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: addl %edx, %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: orl %edi, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx
-; FALLBACK28-NEXT: movl %edx, %ebp
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi
-; FALLBACK28-NEXT: leal (%edi,%edi), %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %ebp, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: addl %edx, %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: orl %ebx, %edx
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %ebp
-; FALLBACK28-NEXT: movl %eax, %edx
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: addl %eax, %eax
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %eax
-; FALLBACK28-NEXT: orl %ebp, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %edi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %ebp
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi
-; FALLBACK28-NEXT: leal (%edi,%edi), %eax
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %eax
-; FALLBACK28-NEXT: orl %ebp, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %eax, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx
-; FALLBACK28-NEXT: movl %ebx, %ebp
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx
-; FALLBACK28-NEXT: leal (%edx,%edx), %eax
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %eax
-; FALLBACK28-NEXT: orl %ebp, %eax
-; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %edi
-; FALLBACK28-NEXT: addl %ebx, %ebx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %edi, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi
-; FALLBACK28-NEXT: movl %edi, %ebp
-; FALLBACK28-NEXT: movl %eax, %ecx
-; FALLBACK28-NEXT: shrl %cl, %ebp
-; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx
-; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebx
-; FALLBACK28-NEXT: orl %ebp, %ebx
-; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %edx
-; FALLBACK28-NEXT: addl %edi, %edi
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edi
-; FALLBACK28-NEXT: orl %edx, %edi
-; FALLBACK28-NEXT: movl %esi, %edx
-; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi
-; FALLBACK28-NEXT: movl %esi, %ebx
-; FALLBACK28-NEXT: movb %al, %cl
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax
-; FALLBACK28-NEXT: leal (%eax,%eax), %ebp
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %ebp
-; FALLBACK28-NEXT: orl %ebx, %ebp
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT: shrl %cl, %ebx
-; FALLBACK28-NEXT: addl %esi, %esi
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %esi
-; FALLBACK28-NEXT: orl %ebx, %esi
-; FALLBACK28-NEXT: movb %dl, %cl
-; FALLBACK28-NEXT: shrl %cl, %eax
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx
-; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx
-; FALLBACK28-NEXT: movb %ch, %cl
-; FALLBACK28-NEXT: shll %cl, %edx
-; FALLBACK28-NEXT: orl %eax, %edx
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK28-NEXT: sarl %cl, %ebx
-; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK28-NEXT: movl %ebx, 60(%eax)
-; FALLBACK28-NEXT: movl %edx, 56(%eax)
-; FALLBACK28-NEXT: movl %esi, 48(%eax)
-; FALLBACK28-NEXT: movl %ebp, 52(%eax)
-; FALLBACK28-NEXT: movl %edi, 40(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 44(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 32(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 36(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 24(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 28(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 16(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 20(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 8(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 12(%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, (%eax)
-; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT: movl %ecx, 4(%eax)
-; FALLBACK28-NEXT: addl $204, %esp
-; FALLBACK28-NEXT: popl %esi
-; FALLBACK28-NEXT: popl %edi
-; FALLBACK28-NEXT: popl %ebx
-; FALLBACK28-NEXT: popl %ebp
-; FALLBACK28-NEXT: vzeroupper
-; FALLBACK28-NEXT: retl
-;
-; FALLBACK29-LABEL: ashr_64bytes:
-; FALLBACK29: # %bb.0:
-; FALLBACK29-NEXT: pushl %ebp
-; FALLBACK29-NEXT: pushl %ebx
-; FALLBACK29-NEXT: pushl %edi
-; FALLBACK29-NEXT: pushl %esi
-; FALLBACK29-NEXT: subl $188, %esp
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK29-NEXT: vmovups (%eax), %ymm0
-; FALLBACK29-NEXT: vmovups 32(%eax), %xmm1
-; FALLBACK29-NEXT: movl 48(%eax), %edx
-; FALLBACK29-NEXT: movl 52(%eax), %esi
-; FALLBACK29-NEXT: movl 56(%eax), %edi
-; FALLBACK29-NEXT: movl 60(%eax), %eax
-; FALLBACK29-NEXT: movl (%ecx), %ecx
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: sarl $31, %eax
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT: movl %ecx, %ebp
-; FALLBACK29-NEXT: andl $60, %ebp
-; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shll $3, %ecx
-; FALLBACK29-NEXT: andl $24, %ecx
-; FALLBACK29-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %esi
-; FALLBACK29-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edx
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl %esi, %edx
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edi
-; FALLBACK29-NEXT: shrdl %cl, %esi, %edi
-; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill
-; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx
-; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax
-; FALLBACK29-NEXT: movl %eax, %edi
-; FALLBACK29-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK29-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx
-; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax
-; FALLBACK29-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK29-NEXT: movl %edx, 56(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx
-; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK29-NEXT: sarl %cl, %eax
-; FALLBACK29-NEXT: movl %eax, 60(%ebp)
-; FALLBACK29-NEXT: movl %esi, 48(%ebp)
-; FALLBACK29-NEXT: movl %edi, 52(%ebp)
-; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 40(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 44(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 32(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 36(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 24(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 28(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 16(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 20(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 8(%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 12(%ebp)
-; FALLBACK29-NEXT: movl %ebx, (%ebp)
-; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT: movl %eax, 4(%ebp)
-; FALLBACK29-NEXT: addl $188, %esp
-; FALLBACK29-NEXT: popl %esi
-; FALLBACK29-NEXT: popl %edi
-; FALLBACK29-NEXT: popl %ebx
-; FALLBACK29-NEXT: popl %ebp
-; FALLBACK29-NEXT: vzeroupper
-; FALLBACK29-NEXT: retl
-;
-; FALLBACK30-LABEL: ashr_64bytes:
-; FALLBACK30: # %bb.0:
-; FALLBACK30-NEXT: pushl %ebp
-; FALLBACK30-NEXT: pushl %ebx
-; FALLBACK30-NEXT: pushl %edi
-; FALLBACK30-NEXT: pushl %esi
-; FALLBACK30-NEXT: subl $204, %esp
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK30-NEXT: vmovups (%ecx), %ymm0
-; FALLBACK30-NEXT: vmovups 32(%ecx), %xmm1
-; FALLBACK30-NEXT: movl 48(%ecx), %edx
-; FALLBACK30-NEXT: movl 52(%ecx), %esi
-; FALLBACK30-NEXT: movl 56(%ecx), %edi
-; FALLBACK30-NEXT: movl 60(%ecx), %ecx
-; FALLBACK30-NEXT: movl (%eax), %eax
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: sarl $31, %ecx
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK30-NEXT: movl %eax, %ecx
-; FALLBACK30-NEXT: leal (,%eax,8), %edx
-; FALLBACK30-NEXT: andl $24, %edx
-; FALLBACK30-NEXT: andl $60, %ecx
-; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi
-; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, %esi, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl %edx, %ebx
-; FALLBACK30-NEXT: notb %bl
-; FALLBACK30-NEXT: leal (%edi,%edi), %ebp
-; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax
-; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
-; FALLBACK30-NEXT: addl %esi, %esi
-; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK30-NEXT: orl %edi, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi
-; FALLBACK30-NEXT: leal (%esi,%esi), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi
-; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK30-NEXT: orl %eax, %edi
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: leal (%eax,%eax), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi
-; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: orl %esi, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi
-; FALLBACK30-NEXT: leal (%esi,%esi), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi
-; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi
-; FALLBACK30-NEXT: orl %eax, %edi
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: leal (%eax,%eax), %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi
-; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, %esi, %esi
-; FALLBACK30-NEXT: addl %edi, %edi
-; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax
-; FALLBACK30-NEXT: orl %esi, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: leal (%eax,%eax), %esi
-; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax
-; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi
-; FALLBACK30-NEXT: movl %ecx, %edi
-; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp
-; FALLBACK30-NEXT: orl %ebp, %eax
-; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; FALLBACK30-NEXT: addl %esi, %esi
-; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi
-; FALLBACK30-NEXT: orl %ecx, %esi
-; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp
-; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx
-; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx
-; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax
-; FALLBACK30-NEXT: shrxl %edx, %eax, %edi
-; FALLBACK30-NEXT: orl %edi, %ecx
-; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK30-NEXT: addl %eax, %eax
-; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi
-; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp
-; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx
-; FALLBACK30-NEXT: addl %ebp, %ebp
-; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx
-; FALLBACK30-NEXT: orl %eax, %ebx
-; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK30-NEXT: movl %edx, 60(%eax)
-; FALLBACK30-NEXT: movl %ebx, 56(%eax)
-; FALLBACK30-NEXT: movl %edi, 48(%eax)
-; FALLBACK30-NEXT: movl %ecx, 52(%eax)
-; FALLBACK30-NEXT: movl %esi, 40(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 44(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 32(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 36(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 24(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 28(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 16(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 20(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 8(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 12(%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, (%eax)
-; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK30-NEXT: movl %ecx, 4(%eax)
-; FALLBACK30-NEXT: addl $204, %esp
-; FALLBACK30-NEXT: popl %esi
-; FALLBACK30-NEXT: popl %edi
-; FALLBACK30-NEXT: popl %ebx
-; FALLBACK30-NEXT: popl %ebp
-; FALLBACK30-NEXT: vzeroupper
-; FALLBACK30-NEXT: retl
-;
-; FALLBACK31-LABEL: ashr_64bytes:
-; FALLBACK31: # %bb.0:
-; FALLBACK31-NEXT: pushl %ebp
-; FALLBACK31-NEXT: pushl %ebx
-; FALLBACK31-NEXT: pushl %edi
-; FALLBACK31-NEXT: pushl %esi
-; FALLBACK31-NEXT: subl $188, %esp
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax
-; FALLBACK31-NEXT: vmovups (%eax), %ymm0
-; FALLBACK31-NEXT: vmovups 32(%eax), %xmm1
-; FALLBACK31-NEXT: movl 48(%eax), %edx
-; FALLBACK31-NEXT: movl 52(%eax), %esi
-; FALLBACK31-NEXT: movl 56(%eax), %edi
-; FALLBACK31-NEXT: movl 60(%eax), %eax
-; FALLBACK31-NEXT: movl (%ecx), %ecx
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: sarl $31, %eax
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK31-NEXT: movl %ecx, %ebp
-; FALLBACK31-NEXT: andl $60, %ebp
-; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx
-; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shll $3, %ecx
-; FALLBACK31-NEXT: andl $24, %ecx
-; FALLBACK31-NEXT: shrdl %cl, %edx, %eax
-; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %esi
-; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi
-; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %edi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %esi
-; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx
-; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edi
-; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi
-; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %esi, %edx
-; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx
-; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax
-; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl %edi, %edx
-; FALLBACK31-NEXT: shrdl %cl, %eax, %edx
-; FALLBACK31-NEXT: shrdl %cl, %edi, %esi
-; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi
-; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp
-; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax
-; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK31-NEXT: movl %eax, 56(%ebp)
-; FALLBACK31-NEXT: movl %esi, 48(%ebp)
-; FALLBACK31-NEXT: movl %edx, 52(%ebp)
-; FALLBACK31-NEXT: movl %ebx, 40(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 44(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 32(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 36(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 24(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 28(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 16(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 20(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 8(%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK31-NEXT: movl %eax, 12(%ebp)
-; FALLBACK31-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK31-NEXT: shrdl %cl, %edx, %edi
-; FALLBACK31-NEXT: movl %edi, (%ebp)
-; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK31-NEXT: movl %ecx, 4(%ebp)
-; FALLBACK31-NEXT: movl %eax, 60(%ebp)
-; FALLBACK31-NEXT: addl $188, %esp
-; FALLBACK31-NEXT: popl %esi
-; FALLBACK31-NEXT: popl %edi
-; FALLBACK31-NEXT: popl %ebx
-; FALLBACK31-NEXT: popl %ebp
-; FALLBACK31-NEXT: vzeroupper
-; FALLBACK31-NEXT: retl
+; X64-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
+; X64-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %edi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rdi,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %edi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rdi), %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rbx,%rbx), %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r11, %r9
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r8, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r15, %r11
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rdi), %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%r13,%r13), %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r12, %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: leaq (%rdi,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: orq %r13, %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 24(%rdi), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 32(%rdi), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarq %cl, %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r8, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, -128(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r9, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r14,%r14), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %r10, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r10, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%r15,%r15), %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r12, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxq %rsi, %r15, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leaq (%rax,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxq %rcx, %r14, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orq %rbx, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r12, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addq $8, %rsp
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq (%rdi), %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 8(%rdi), %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 16(%rdi), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 24(%rdi), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 32(%rdi), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 40(%rdi), %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 48(%rdi), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -112(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -128(%rsp,%rax), %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -120(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rdi, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -96(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -104(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r10, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -80(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -88(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r14, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r11, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r14, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %rax, %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdq %cl, %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r11, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r10, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r15, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rsi, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %r8, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
+; X64-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rdi), %rax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %edi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rcx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rdi,8), %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %edi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r9,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r14, %r10
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rdi), %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %r14, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r12, %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: leaq (%rdi,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: orq %r9, %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r12, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: popq %rbp
+; X64-NO-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leaq (%rax,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxq %rsi, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orq %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r13
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%rdi), %xmm0
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%rdi), %xmm2
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
+;
+; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %edi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rcx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rdi,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %edi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r12,%r12), %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r14, %r10
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r14, %r13
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rdi), %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r14, %r14
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r12, %r14
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rdi,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r12, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r14, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+;
+; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%ecx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ebp), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%esi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%edx), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%eax), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebp, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%edx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%esi,%esi), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 112(%esp,%ebp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ecx,%ecx), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %esi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 116(%esp,%edx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 120(%esp,%edx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%edx,%edx), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %bl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 12(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 16(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 20(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 32(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 36(%eax), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 40(%eax), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 44(%eax), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: sarl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (,%eax,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 120(%esp,%ebp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%edi,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 116(%esp,%ebp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl 124(%esp,%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shlxl %edx, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: shrxl %ebx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 52(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 12(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 16(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 20(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 24(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 32(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 36(%eax), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 40(%eax), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 44(%eax), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE2-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-SSE4-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups (%eax), %xmm0
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 16(%eax), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movups 32(%eax), %xmm2
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: sarl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%ecx), %xmm0
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%ecx), %xmm2
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (,%eax,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 120(%esp,%ebp), %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%edi,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 116(%esp,%ebp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl 124(%esp,%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shlxl %edx, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: shrxl %ebx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 52(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups (%eax), %xmm0
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 16(%eax), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movups 32(%eax), %xmm2
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
+;
+; X86-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 48(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 52(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 56(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%eax), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%eax,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 120(%esp,%ebp), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%edi,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 116(%esp,%ebp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 124(%esp,%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 52(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl
+;
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%eax), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 338e104..221a51e 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -712,33 +712,33 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -994,42 +994,42 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ecx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %al, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%edx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%edx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%esp,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, 28(%esp,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, 28(%esp,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 12(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 4(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -1297,33 +1297,33 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, (%esp,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edi), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%esp,%edi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, (%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%edx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $44, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -1487,31 +1487,31 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_32bytes:
@@ -1761,88 +1761,90 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %eax, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -2040,32 +2042,32 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %cl, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: negb %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movsbq %sil, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -16(%rsp,%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rdi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -16(%rsp,%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rdi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
@@ -2319,97 +2321,101 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl (%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %dl, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, 92(%esp,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, 92(%esp,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 28(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $108, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -2610,31 +2616,31 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rcx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rsi,8), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rax, %rsi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %sil, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi,8), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %al
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -72(%rsp,%rsi,8), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi,8), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rsi,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rsi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_32bytes:
@@ -2927,60 +2933,59 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%esi,4), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 32(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %ebx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %eax, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 32(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%esi,4), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%esi,4), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %eax, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%esi,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %ecx, %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 24(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%esi)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 24(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 20(%esi)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%esi)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3263,13 +3268,11 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: lshr_64bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -3292,65 +3295,63 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
@@ -3868,20 +3869,20 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -3906,116 +3907,117 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -4388,10 +4390,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: shl_64bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
@@ -4419,63 +4419,61 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r15, %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r15, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r12, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, -8(%rsp,%rsi), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -16(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rsi, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %rbx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: shl_64bytes:
@@ -4972,33 +4970,33 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0
@@ -5011,7 +5009,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
@@ -5032,149 +5030,152 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edx), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 56(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 60(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 52(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 40(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 44(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 32(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 36(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 24(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 16(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 20(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%edx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%edx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $204, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -5534,13 +5535,11 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: ashr_64bytes:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r15
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9
@@ -5567,65 +5566,63 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r10d
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r11,%r11), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rbx, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r12
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r12,%r12), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r10, %r13, %r10
; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r12, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rax,%rax), %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r15, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 48(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r13
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbp
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
@@ -6221,33 +6218,31 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
@@ -6256,87 +6251,84 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index c3054a3..6b5c604 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -1635,22 +1635,22 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1807,40 +1807,43 @@ define void @load_16byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
@@ -1906,13 +1909,13 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
-; X64-BMI2-NEXT: andl $56, %esi
-; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: movl %eax, %ecx
; X64-BMI2-NEXT: notl %eax
-; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
-; X64-BMI2-NEXT: addl %esi, %esi
-; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT: addl %edi, %edi
+; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movb %cl, (%rdx)
; X64-BMI2-NEXT: popq %rax
@@ -2070,13 +2073,13 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
-; X64-BMI2-NEXT: andl $56, %esi
-; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: movl %eax, %ecx
; X64-BMI2-NEXT: notl %eax
-; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
-; X64-BMI2-NEXT: addl %esi, %esi
-; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT: addl %edi, %edi
+; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movw %cx, (%rdx)
; X64-BMI2-NEXT: popq %rax
@@ -2233,13 +2236,13 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X64-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-BMI2-NEXT: leal (,%rsi,8), %eax
; X64-BMI2-NEXT: andl $56, %eax
-; X64-BMI2-NEXT: andl $56, %esi
-; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
+; X64-BMI2-NEXT: movl %eax, %ecx
; X64-BMI2-NEXT: notl %eax
-; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi
-; X64-BMI2-NEXT: addl %esi, %esi
-; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax
+; X64-BMI2-NEXT: andl $56, %esi
+; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %edi
+; X64-BMI2-NEXT: addl %edi, %edi
+; X64-BMI2-NEXT: shlxq %rax, %rdi, %rax
+; X64-BMI2-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rcx
; X64-BMI2-NEXT: orl %eax, %ecx
; X64-BMI2-NEXT: movl %ecx, (%rdx)
; X64-BMI2-NEXT: popq %rax
@@ -2521,10 +2524,11 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
;
; X86-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X86-HAVE-BMI2-NO-SHLD: # %bb.0:
+; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $140, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -2541,25 +2545,26 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, (%esp,%ecx), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $128, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $140, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: retl
%init = load <32 x i8>, ptr %src, align 1
%intermediate.sroa.0.0.vec.expand = shufflevector <32 x i8> %init, <32 x i8> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -2667,21 +2672,21 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r10, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
@@ -2860,33 +2865,33 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 16(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 8(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 4(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp
@@ -3026,9 +3031,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movups 16(%rdi), %xmm1
; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2
@@ -3043,38 +3046,36 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, -128(%rsp,%rsi), %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r9, %r8
; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax
; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r9, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r10,%r10), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %rbx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rdi, %r10, %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rsi), %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi
; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 24(%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r14
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_32byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -3304,7 +3305,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $172, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0
@@ -3320,59 +3321,60 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%eax,8), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 32(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esp,%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%esp,%eax), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %ebp, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esp,%eax), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%esp,%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%eax), %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %eax
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 24(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 20(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 16(%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx)
@@ -3380,7 +3382,7 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $156, %esp
+; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $172, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 84c2cc6..bed8e58 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -168,8 +168,8 @@ define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movb %al, (%rdx)
@@ -188,17 +188,15 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -215,13 +213,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
@@ -236,14 +232,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -260,23 +253,19 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -292,8 +281,8 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
@@ -312,17 +301,15 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -339,18 +326,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
@@ -360,14 +345,11 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -386,18 +368,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
@@ -413,8 +393,8 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
@@ -433,17 +413,15 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -460,18 +438,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
@@ -481,14 +457,11 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -507,18 +480,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
@@ -536,8 +507,8 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -557,8 +528,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -571,8 +542,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -591,8 +562,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -698,8 +669,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -719,8 +690,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -733,8 +704,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -753,8 +724,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -859,8 +830,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -880,8 +851,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -894,8 +865,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -914,8 +885,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1020,8 +991,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -1041,8 +1012,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1055,8 +1026,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -1075,8 +1046,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1908,22 +1879,22 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %al
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rax,8), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rsi, %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrb $6, %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -72(%rsp,%rcx,8), %rdi
; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rax,8), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rcx,8), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rcx,8), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rcx, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx)
; X64-HAVE-BMI2-NO-SHLD-NEXT: retq
;
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_16byte_chunk_of_32byte_alloca:
@@ -2084,40 +2055,43 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %cl, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, 16(%esp,%ecx,4), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, 16(%esp,%esi,4), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebp
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%ecx,4), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%esp,%esi,4), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %edi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%esp,%esi,4), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebp, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%esi,4), %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ecx,4), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 8(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 4(%ecx)
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%ecx)
; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $92, %esp
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
index 4d261a9..37620ec 100644
--- a/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -820,7 +820,7 @@ define void @infiniteloop() {
; ENABLE-NEXT: movq %rsp, %rax
; ENABLE-NEXT: addq $-16, %rax
; ENABLE-NEXT: movq %rax, %rsp
-; ENABLE-NEXT: xorl %ecx, %ecx
+; ENABLE-NEXT: xorl %ecx, %ecx
; ENABLE-NEXT: .p2align 4
; ENABLE-NEXT: LBB10_2: ## %for.body
; ENABLE-NEXT: ## =>This Inner Loop Header: Depth=1
@@ -851,8 +851,8 @@ define void @infiniteloop() {
; DISABLE-NEXT: ## %bb.1: ## %if.then
; DISABLE-NEXT: movq %rsp, %rax
; DISABLE-NEXT: addq $-16, %rax
-; DISABLE-NEXT: %rax, %rsp
-; DISABLE-NEXT: xorl %ecx, %ecx
+; DISABLE-NEXT: movq %rax, %rsp
+; DISABLE-NEXT: xorl %ecx, %ecx
; DISABLE-NEXT: .p2align 4
; DISABLE-NEXT: LBB10_2: ## %for.body
; DISABLE-NEXT: ## =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/X86/x87-stack-pop.mir b/llvm/test/CodeGen/X86/x87-stack-pop.mir
index 1c4ffa5..73144fd 100644
--- a/llvm/test/CodeGen/X86/x87-stack-pop.mir
+++ b/llvm/test/CodeGen/X86/x87-stack-pop.mir
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=i686-- -run-pass x86-codegen -O2 -o - %s | FileCheck %s
+# RUN: llc -mtriple=i686-- -run-pass=x86-fp-stackifier -O2 -o - %s | FileCheck %s
+# RUN: llc -mtriple=i686-- -passes=x86-fp-stackifier -O2 -o - %s | FileCheck %s
---
name: func_fxam