Diffstat (limited to 'llvm/test/CodeGen/X86')
-rw-r--r--  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll | 14
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/select-copy.mir | 136
-rw-r--r--  llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/avx10_2bf16-arith.ll | 4
-rw-r--r--  llvm/test/CodeGen/X86/bittest-big-integer.ll | 1515
-rw-r--r--  llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/call-graph-section-assembly.ll | 3
-rw-r--r--  llvm/test/CodeGen/X86/gfni-shifts.ll | 75
-rw-r--r--  llvm/test/CodeGen/X86/narrow-add-i64.ll | 94
-rw-r--r--  llvm/test/CodeGen/X86/pr166534.ll | 124
-rw-r--r--  llvm/test/CodeGen/X86/pr166744.ll | 66
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-ashr-512.ll | 40
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-lshr-512.ll | 22
-rw-r--r--  llvm/test/CodeGen/X86/vector-shift-shl-512.ll | 19
-rw-r--r--  llvm/test/CodeGen/X86/vpternlog.ll (renamed from llvm/test/CodeGen/X86/issue163738.ll) | 12
-rw-r--r--  llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll | 185
18 files changed, 963 insertions(+), 1357 deletions(-)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
index 87059c5..6ae7b22 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
define dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 {
; CHECK-LABEL: @test_no_bitcast(
diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
index 5fb2dcd..ca7c357 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
+; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s
define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) {
; CHECK-LABEL: @test_amx_load_non_O0(
diff --git a/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll
new file mode 100644
index 0000000..841c9a6
--- /dev/null
+++ b/llvm/test/CodeGen/X86/GlobalISel/reloc-none.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=CHECK
+
+define void @test_reloc_none() {
+; CHECK-LABEL: test_reloc_none:
+; CHECK: # %bb.0:
+; CHECK-NEXT: .Lreloc_none0:
+; CHECK-NEXT: .reloc .Lreloc_none0, BFD_RELOC_NONE, foo
+; CHECK-NEXT: retq
+ call void @llvm.reloc.none(metadata !"foo")
+ ret void
+}
+
+declare void @llvm.reloc.none(metadata)
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir
index 41e1b5b..5c059a4 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir
@@ -1,5 +1,6 @@
-# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
-# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X86
+# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X64
--- |
@@ -30,24 +31,23 @@
...
---
name: test_copy
-# ALL-LABEL: name: test_copy
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
-# ALL: %0:gr8 = COPY $al
-# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0
-# ALL-NEXT: $eax = COPY %1
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; CHECK-LABEL: name: test_copy
+ ; CHECK: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al
+ ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]]
+ ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s8) = COPY $al
%1(s32) = G_ZEXT %0(s8)
$eax = COPY %1(s32)
@@ -56,24 +56,23 @@ body: |
...
---
name: test_copy2
-# ALL-LABEL: name: test_copy2
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
-# ALL: %0:gr8 = COPY $al
-# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0
-# ALL-NEXT: $eax = COPY %1
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; CHECK-LABEL: name: test_copy2
+ ; CHECK: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al
+ ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]]
+ ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s8) = COPY $al
%1(s32) = G_ZEXT %0(s8)
$eax = COPY %1(s32)
@@ -82,30 +81,35 @@ body: |
...
---
name: test_copy3
-# ALL-LABEL: name: test_copy3
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr16[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] }
-# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] }
-# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr16 = COPY $ax
-# X32-NEXT: %3:gr16_abcd = COPY %0
-# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit
-# X64-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; X86-LABEL: name: test_copy3
+ ; X86: liveins: $eax
+ ; X86-NEXT: {{ $}}
+ ; X86-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax
+ ; X86-NEXT: [[COPY1:%[0-9]+]]:gr16_abcd = COPY [[COPY]]
+ ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit
+ ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]]
+ ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X86-NEXT: RET 0, implicit $eax
+ ;
+ ; X64-LABEL: name: test_copy3
+ ; X64: liveins: $eax
+ ; X64-NEXT: {{ $}}
+ ; X64-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax
+ ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
+ ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]]
+ ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X64-NEXT: RET 0, implicit $eax
%0(s16) = COPY $ax
%1(s8) = G_TRUNC %0(s16)
%2(s32) = G_ZEXT %1(s8)
@@ -115,27 +119,25 @@ body: |
...
---
name: test_copy4
-# ALL-LABEL: name: test_copy4
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr32 = COPY $eax
-# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit
-# ALL-NEXT: %2:gr32 = MOVZX32rr16 %1
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax
+ ; CHECK-LABEL: name: test_copy4
+ ; CHECK: liveins: $eax
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $eax
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
+ ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[COPY1]]
+ ; CHECK-NEXT: $eax = COPY [[MOVZX32rr16_]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s32) = COPY $eax
%1(s16) = G_TRUNC %0(s32)
%2(s32) = G_ZEXT %1(s16)
@@ -145,30 +147,35 @@ body: |
...
---
name: test_copy5
-# ALL-LABEL: name: test_copy5
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] }
-# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] }
-# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr32 = COPY $edx
-# X32-NEXT: %3:gr32_abcd = COPY %0
-# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit
-# X64-NEXT: %1:gr8 = COPY %0.sub_8bit
-# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax,$edx
+ ; X86-LABEL: name: test_copy5
+ ; X86: liveins: $eax, $edx
+ ; X86-NEXT: {{ $}}
+ ; X86-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
+ ; X86-NEXT: [[COPY1:%[0-9]+]]:gr32_abcd = COPY [[COPY]]
+ ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit
+ ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]]
+ ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X86-NEXT: RET 0, implicit $eax
+ ;
+ ; X64-LABEL: name: test_copy5
+ ; X64: liveins: $eax, $edx
+ ; X64-NEXT: {{ $}}
+ ; X64-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
+ ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit
+ ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]]
+ ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]]
+ ; X64-NEXT: RET 0, implicit $eax
%0(s32) = COPY $edx
%1(s8) = G_TRUNC %0(s32)
%2(s32) = G_ANYEXT %1(s8)
@@ -178,29 +185,26 @@ body: |
...
---
name: test_copy6
-# ALL-LABEL: name: test_copy6
alignment: 16
legalized: true
regBankSelected: true
-# ALL: registers:
-# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 2, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] }
-# ALL-NEXT: - { id: 3, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] }
registers:
- { id: 0, class: gpr, preferred-register: '' }
- { id: 1, class: gpr, preferred-register: '' }
- { id: 2, class: gpr, preferred-register: '' }
-# ALL: %0:gr32 = COPY $edx
-# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit
-# ALL-NEXT: %3:low32_addr_access_rbp = IMPLICIT_DEF
-# ALL-NEXT: %2:low32_addr_access_rbp = INSERT_SUBREG %3, %1, %subreg.sub_16bit
-# ALL-NEXT: $eax = COPY %2
-# ALL-NEXT: RET 0, implicit $eax
body: |
bb.1 (%ir-block.0):
liveins: $eax,$edx
+ ; CHECK-LABEL: name: test_copy6
+ ; CHECK: liveins: $eax, $edx
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:low32_addr_access_rbp = IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:low32_addr_access_rbp = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.sub_16bit
+ ; CHECK-NEXT: $eax = COPY [[INSERT_SUBREG]]
+ ; CHECK-NEXT: RET 0, implicit $eax
%0(s32) = COPY $edx
%1(s16) = G_TRUNC %0(s32)
%2(s32) = G_ANYEXT %1(s16)
diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
index 79849a7..d9b4635 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
; X86: # %bb.0:
-; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
; X86-NEXT: vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
index 0f2c75b..01b7618 100644
--- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
+++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
;
; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
; X86: # %bb.0:
-; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index c311ab8..9d31c29 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -356,41 +356,20 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB9_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl $0, %edx
-; X86-NEXT: .LBB9_2:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: notl %esi
-; X86-NEXT: notl %edx
-; X86-NEXT: je .LBB9_4
-; X86-NEXT: # %bb.3:
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB9_4:
-; X86-NEXT: andl 4(%ebx), %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: andl (%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: andl $32, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl (%ebx,%eax), %eax
-; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
-; X86-NEXT: movl %edx, (%ebx)
-; X86-NEXT: movl %esi, 4(%ebx)
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -600,208 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $96, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movzbl 16(%ebp), %ebx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 64(%esp,%eax), %edx
-; X86-NEXT: movl 68(%esp,%eax), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: movl 72(%esp,%esi), %ebx
-; X86-NEXT: movl 76(%esp,%esi), %esi
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: shldl %cl, %ebx, %esi
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: notl %edi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: movl 36(%esp,%ecx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esp,%ecx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: andl 8(%eax), %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: notl %esi
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl 44(%esp,%eax), %eax
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: andl 12(%ecx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl 32(%esp,%eax), %edx
-; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: andl (%eax), %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: andl 4(%ecx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl 12(%ebp), %esi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: andl $96, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl (%ecx,%eax), %eax
-; X86-NEXT: btl %esi, %eax
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %edi, 8(%ecx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %esi
-; SSE-NEXT: xorl %r8d, %r8d
-; SSE-NEXT: shldq %cl, %rsi, %r8
-; SSE-NEXT: shlq %cl, %rsi
-; SSE-NEXT: movl %edx, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: xorl %r9d, %r9d
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rsi, %r8
-; SSE-NEXT: cmovneq %r9, %rsi
-; SSE-NEXT: notq %r8
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: cmovneq %r9, %rax
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: andq 8(%rdi), %r8
-; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq (%rdi), %rsi
-; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: movl %ecx, %eax
-; SSE-NEXT: andl $96, %eax
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: movl (%rdi,%rax), %eax
-; SSE-NEXT: btl %ecx, %eax
+; SSE-NEXT: andl $96, %esi
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: movl (%rdi,%rsi), %r8d
+; SSE-NEXT: btl %ecx, %r8d
; SSE-NEXT: setae %al
-; SSE-NEXT: movq %rsi, (%rdi)
-; SSE-NEXT: movq %r8, 8(%rdi)
+; SSE-NEXT: shll %cl, %edx
+; SSE-NEXT: btrl %ecx, %r8d
+; SSE-NEXT: orl %r8d, %edx
+; SSE-NEXT: movl %edx, (%rdi,%rsi)
; SSE-NEXT: retq
;
-; AVX2-LABEL: init_eq_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %rax, %rsi
-; AVX2-NEXT: movl %edx, %edx
-; AVX2-NEXT: xorl %r8d, %r8d
-; AVX2-NEXT: shldq %cl, %rdx, %r8
-; AVX2-NEXT: xorl %r9d, %r9d
-; AVX2-NEXT: shlxq %rcx, %rax, %rax
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rax, %rsi
-; AVX2-NEXT: cmovneq %r9, %rax
-; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX2-NEXT: cmovneq %rdx, %r8
-; AVX2-NEXT: cmovneq %r9, %rdx
-; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: andnq (%rdi), %rax, %r8
-; AVX2-NEXT: orq %rdx, %r8
-; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: andl $96, %eax
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: movl (%rdi,%rax), %eax
-; AVX2-NEXT: btl %ecx, %eax
-; AVX2-NEXT: setae %al
-; AVX2-NEXT: movq %r8, (%rdi)
-; AVX2-NEXT: movq %rsi, 8(%rdi)
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: init_eq_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %rax, %rsi
-; AVX512-NEXT: xorl %r8d, %r8d
-; AVX512-NEXT: shlxq %rcx, %rax, %rax
-; AVX512-NEXT: movl %edx, %edx
-; AVX512-NEXT: xorl %r9d, %r9d
-; AVX512-NEXT: shldq %cl, %rdx, %r9
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rax, %rsi
-; AVX512-NEXT: cmovneq %r8, %rax
-; AVX512-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX512-NEXT: cmovneq %rdx, %r9
-; AVX512-NEXT: cmovneq %r8, %rdx
-; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi
-; AVX512-NEXT: orq %r9, %rsi
-; AVX512-NEXT: andnq (%rdi), %rax, %r8
-; AVX512-NEXT: orq %rdx, %r8
-; AVX512-NEXT: movl %ecx, %eax
-; AVX512-NEXT: andl $96, %eax
-; AVX512-NEXT: shrl $3, %eax
-; AVX512-NEXT: movl (%rdi,%rax), %eax
-; AVX512-NEXT: btl %ecx, %eax
-; AVX512-NEXT: setae %al
-; AVX512-NEXT: movq %r8, (%rdi)
-; AVX512-NEXT: movq %rsi, 8(%rdi)
-; AVX512-NEXT: retq
+; AVX-LABEL: init_eq_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: andl $96, %ecx
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: movl (%rdi,%rcx), %r8d
+; AVX-NEXT: btl %esi, %r8d
+; AVX-NEXT: setae %al
+; AVX-NEXT: btrl %esi, %r8d
+; AVX-NEXT: shlxl %esi, %edx, %edx
+; AVX-NEXT: orl %r8d, %edx
+; AVX-NEXT: movl %edx, (%rdi,%rcx)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -977,673 +803,55 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $352, %esp # imm = 0x160
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: andl $60, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl %edx, %eax
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 56(%eax), %esi
-; X86-NEXT: movl 60(%eax), %ebx
-; X86-NEXT: movl 52(%eax), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%eax), %edi
-; X86-NEXT: movl 44(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl 16(%ebp), %eax
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 56(%eax), %esi
-; X86-NEXT: movl 60(%eax), %edi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: andl 60(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 52(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 56(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 48(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 52(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 44(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 48(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 40(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 44(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 36(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 40(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 32(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 36(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 28(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 32(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 24(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 28(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 20(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 24(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 16(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 20(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 12(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 16(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 8(%eax), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: andl 12(%edx), %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 4(%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: andl 8(%edx), %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: notl %esi
-; X86-NEXT: movl (%eax), %eax
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: andl 4(%edx), %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: notl %esi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: andl (%edx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%edx,%eax), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 60(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 56(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 52(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 48(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 44(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 40(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 36(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 32(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 28(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 24(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 20(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 16(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 12(%edx)
-; X86-NEXT: movl %ebx, 8(%edx)
-; X86-NEXT: movl %edi, 4(%edx)
-; X86-NEXT: movl %esi, (%edx)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%edx,%esi), %edi
+; X86-NEXT: btl %ecx, %edi
; X86-NEXT: setae %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, (%edx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i512:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $168, %rsp
-; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: movl %esi, %eax
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: andl $56, %eax
-; SSE-NEXT: negl %eax
-; SSE-NEXT: movslq %eax, %r12
-; SSE-NEXT: movq 136(%rsp,%r12), %r9
-; SSE-NEXT: movq 144(%rsp,%r12), %rax
-; SSE-NEXT: movq %rax, %rsi
-; SSE-NEXT: shldq %cl, %r9, %rsi
-; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 152(%rsp,%r12), %r11
-; SSE-NEXT: shldq %cl, %rax, %r11
-; SSE-NEXT: movq 120(%rsp,%r12), %r10
-; SSE-NEXT: movq 128(%rsp,%r12), %rax
-; SSE-NEXT: movq %rax, %rbx
-; SSE-NEXT: shldq %cl, %r10, %rbx
-; SSE-NEXT: shldq %cl, %rax, %r9
-; SSE-NEXT: movq 104(%rsp,%r12), %r14
-; SSE-NEXT: movq 112(%rsp,%r12), %rax
-; SSE-NEXT: movq %rax, %r15
-; SSE-NEXT: shldq %cl, %r14, %r15
-; SSE-NEXT: shldq %cl, %rax, %r10
-; SSE-NEXT: movq 96(%rsp,%r12), %rax
-; SSE-NEXT: movq %rax, %r13
-; SSE-NEXT: shlq %cl, %r13
-; SSE-NEXT: shldq %cl, %rax, %r14
-; SSE-NEXT: movl %edx, %eax
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq 8(%rsp,%r12), %r8
-; SSE-NEXT: movq 16(%rsp,%r12), %rsi
-; SSE-NEXT: movq %rsi, %rbp
-; SSE-NEXT: shldq %cl, %r8, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: notq %rax
-; SSE-NEXT: andq 48(%rdi), %rax
-; SSE-NEXT: orq %rbp, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: notq %rbx
-; SSE-NEXT: notq %r11
-; SSE-NEXT: movq 24(%rsp,%r12), %rax
-; SSE-NEXT: shldq %cl, %rsi, %rax
-; SSE-NEXT: movq -8(%rsp,%r12), %rbp
-; SSE-NEXT: movq (%rsp,%r12), %rdx
-; SSE-NEXT: movq %rdx, %rsi
-; SSE-NEXT: shldq %cl, %rbp, %rsi
-; SSE-NEXT: andq 56(%rdi), %r11
-; SSE-NEXT: andq 32(%rdi), %rbx
-; SSE-NEXT: orq %rax, %r11
-; SSE-NEXT: orq %rsi, %rbx
-; SSE-NEXT: notq %r15
-; SSE-NEXT: shldq %cl, %rdx, %r8
-; SSE-NEXT: notq %r9
-; SSE-NEXT: andq 40(%rdi), %r9
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: movq -24(%rsp,%r12), %rax
-; SSE-NEXT: movq -16(%rsp,%r12), %rdx
-; SSE-NEXT: movq %rdx, %rsi
-; SSE-NEXT: shldq %cl, %rax, %rsi
-; SSE-NEXT: andq 16(%rdi), %r15
-; SSE-NEXT: orq %rsi, %r15
-; SSE-NEXT: shldq %cl, %rdx, %rbp
-; SSE-NEXT: notq %r10
-; SSE-NEXT: notq %r13
-; SSE-NEXT: movq -32(%rsp,%r12), %rdx
-; SSE-NEXT: movq %rdx, %rsi
-; SSE-NEXT: shlq %cl, %rsi
-; SSE-NEXT: andq 24(%rdi), %r10
-; SSE-NEXT: andq (%rdi), %r13
-; SSE-NEXT: orq %rbp, %r10
-; SSE-NEXT: orq %rsi, %r13
-; SSE-NEXT: notq %r14
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: andq 8(%rdi), %r14
-; SSE-NEXT: orq %rax, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andl $60, %eax
-; SSE-NEXT: movl (%rdi,%rax), %eax
-; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
-; SSE-NEXT: btl %ecx, %eax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: movq %rax, 48(%rdi)
-; SSE-NEXT: movq %r11, 56(%rdi)
-; SSE-NEXT: movq %rbx, 32(%rdi)
-; SSE-NEXT: movq %r9, 40(%rdi)
-; SSE-NEXT: movq %r15, 16(%rdi)
-; SSE-NEXT: movq %r10, 24(%rdi)
-; SSE-NEXT: movq %r13, (%rdi)
-; SSE-NEXT: movq %r14, 8(%rdi)
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $60, %esi
+; SSE-NEXT: movl (%rdi,%rsi), %r8d
+; SSE-NEXT: btl %ecx, %r8d
; SSE-NEXT: setae %al
-; SSE-NEXT: addq $168, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
+; SSE-NEXT: shll %cl, %edx
+; SSE-NEXT: btrl %ecx, %r8d
+; SSE-NEXT: orl %r8d, %edx
+; SSE-NEXT: movl %edx, (%rdi,%rsi)
; SSE-NEXT: retq
;
-; AVX2-LABEL: init_eq_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $184, %rsp
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: movl %esi, %ebx
-; AVX2-NEXT: shrl $3, %ebx
-; AVX2-NEXT: movl %ebx, %eax
-; AVX2-NEXT: andl $56, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: movslq %eax, %r11
-; AVX2-NEXT: movq 128(%rsp,%r11), %r15
-; AVX2-NEXT: movq 136(%rsp,%r11), %rax
-; AVX2-NEXT: movq %rax, %rsi
-; AVX2-NEXT: shldq %cl, %r15, %rsi
-; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 120(%rsp,%r11), %r8
-; AVX2-NEXT: shldq %cl, %r8, %r15
-; AVX2-NEXT: movq 144(%rsp,%r11), %r14
-; AVX2-NEXT: movq 152(%rsp,%r11), %rsi
-; AVX2-NEXT: movq %rsi, %r9
-; AVX2-NEXT: shldq %cl, %r14, %r9
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rax, %r14
-; AVX2-NEXT: movq 112(%rsp,%r11), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 160(%rsp,%r11), %r13
-; AVX2-NEXT: movq 168(%rsp,%r11), %r12
-; AVX2-NEXT: shldq %cl, %r13, %r12
-; AVX2-NEXT: shldq %cl, %rsi, %r13
-; AVX2-NEXT: shldq %cl, %rax, %r8
-; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl %edx, %eax
-; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq 24(%rsp,%r11), %rbp
-; AVX2-NEXT: movq 32(%rsp,%r11), %rdx
-; AVX2-NEXT: movq %rdx, %rax
-; AVX2-NEXT: shldq %cl, %rbp, %rax
-; AVX2-NEXT: movq 40(%rsp,%r11), %r10
-; AVX2-NEXT: shldq %cl, %rdx, %r10
-; AVX2-NEXT: movq 8(%rsp,%r11), %r9
-; AVX2-NEXT: movq 16(%rsp,%r11), %rdx
-; AVX2-NEXT: movq %rdx, %r8
-; AVX2-NEXT: shldq %cl, %r9, %r8
-; AVX2-NEXT: shldq %cl, %rdx, %rbp
-; AVX2-NEXT: andnq 48(%rdi), %r13, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq -8(%rsp,%r11), %rax
-; AVX2-NEXT: movq (%rsp,%r11), %rdx
-; AVX2-NEXT: movq %rdx, %rsi
-; AVX2-NEXT: shldq %cl, %rax, %rsi
-; AVX2-NEXT: shldq %cl, %rdx, %r9
-; AVX2-NEXT: andnq 56(%rdi), %r12, %r12
-; AVX2-NEXT: andnq 32(%rdi), %r14, %r14
-; AVX2-NEXT: orq %r10, %r12
-; AVX2-NEXT: orq %r8, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX2-NEXT: andnq 40(%rdi), %rdx, %rdx
-; AVX2-NEXT: orq %rbp, %rdx
-; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT: movq -16(%rsp,%r11), %r10
-; AVX2-NEXT: shlxq %rcx, %r10, %r11
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %r10, %rax
-; AVX2-NEXT: andnq 16(%rdi), %r15, %rcx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andnq 24(%rdi), %r10, %r10
-; AVX2-NEXT: orq %rsi, %rcx
-; AVX2-NEXT: orq %r9, %r10
-; AVX2-NEXT: andnq (%rdi), %r8, %rsi
-; AVX2-NEXT: orq %r11, %rsi
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andnq 8(%rdi), %r8, %r8
-; AVX2-NEXT: orq %rax, %r8
-; AVX2-NEXT: andl $60, %ebx
-; AVX2-NEXT: movl (%rdi,%rbx), %eax
-; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
-; AVX2-NEXT: btl %r9d, %eax
-; AVX2-NEXT: movq %r13, 48(%rdi)
-; AVX2-NEXT: movq %r12, 56(%rdi)
-; AVX2-NEXT: movq %r14, 32(%rdi)
-; AVX2-NEXT: movq %rdx, 40(%rdi)
-; AVX2-NEXT: movq %rcx, 16(%rdi)
-; AVX2-NEXT: movq %r10, 24(%rdi)
-; AVX2-NEXT: movq %rsi, (%rdi)
-; AVX2-NEXT: movq %r8, 8(%rdi)
-; AVX2-NEXT: setae %al
-; AVX2-NEXT: addq $184, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: init_eq_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $168, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: movl %esi, %r10d
-; AVX512-NEXT: shrl $3, %r10d
-; AVX512-NEXT: movl %r10d, %r8d
-; AVX512-NEXT: andl $56, %r8d
-; AVX512-NEXT: negl %r8d
-; AVX512-NEXT: movslq %r8d, %r9
-; AVX512-NEXT: movq 112(%rsp,%r9), %r11
-; AVX512-NEXT: movq 120(%rsp,%r9), %r14
-; AVX512-NEXT: movq %r14, %rax
-; AVX512-NEXT: shldq %cl, %r11, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 104(%rsp,%r9), %rax
-; AVX512-NEXT: shldq %cl, %rax, %r11
-; AVX512-NEXT: movq 128(%rsp,%r9), %r15
-; AVX512-NEXT: movq 136(%rsp,%r9), %rbp
-; AVX512-NEXT: movq %rbp, %rbx
-; AVX512-NEXT: shldq %cl, %r15, %rbx
-; AVX512-NEXT: shldq %cl, %r14, %r15
-; AVX512-NEXT: movq 144(%rsp,%r9), %r13
-; AVX512-NEXT: movq 152(%rsp,%r9), %r12
-; AVX512-NEXT: shldq %cl, %r13, %r12
-; AVX512-NEXT: movq 96(%rsp,%r9), %r14
-; AVX512-NEXT: shldq %cl, %rbp, %r13
-; AVX512-NEXT: shldq %cl, %r14, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movl %edx, %edx
-; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq 8(%rsp,%r9), %r8
-; AVX512-NEXT: movq 16(%rsp,%r9), %rax
-; AVX512-NEXT: movq %rax, %rbp
-; AVX512-NEXT: shldq %cl, %r8, %rbp
-; AVX512-NEXT: andnq 48(%rdi), %r13, %r13
-; AVX512-NEXT: orq %rbp, %r13
-; AVX512-NEXT: movq 24(%rsp,%r9), %rbp
-; AVX512-NEXT: shldq %cl, %rax, %rbp
-; AVX512-NEXT: movq -8(%rsp,%r9), %rax
-; AVX512-NEXT: movq (%rsp,%r9), %rsi
-; AVX512-NEXT: movq %rsi, %rdx
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: andnq 56(%rdi), %r12, %r12
-; AVX512-NEXT: orq %rbp, %r12
-; AVX512-NEXT: andnq 32(%rdi), %r15, %r15
-; AVX512-NEXT: orq %rdx, %r15
-; AVX512-NEXT: shldq %cl, %rsi, %r8
-; AVX512-NEXT: movq -24(%rsp,%r9), %rdx
-; AVX512-NEXT: movq -16(%rsp,%r9), %rsi
-; AVX512-NEXT: movq %rsi, %rbp
-; AVX512-NEXT: shldq %cl, %rdx, %rbp
-; AVX512-NEXT: andnq 40(%rdi), %rbx, %rbx
-; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: andnq 16(%rdi), %r11, %r8
-; AVX512-NEXT: orq %rbp, %r8
-; AVX512-NEXT: shlxq %rcx, %r14, %r11
-; AVX512-NEXT: movq -32(%rsp,%r9), %r9
-; AVX512-NEXT: shldq %cl, %rsi, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: andnq 24(%rdi), %rsi, %rsi
-; AVX512-NEXT: orq %rax, %rsi
-; AVX512-NEXT: shlxq %rcx, %r9, %rax
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %r9, %rdx
-; AVX512-NEXT: andnq (%rdi), %r11, %rcx
-; AVX512-NEXT: orq %rax, %rcx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andnq 8(%rdi), %rax, %rax
-; AVX512-NEXT: orq %rdx, %rax
-; AVX512-NEXT: andl $60, %r10d
-; AVX512-NEXT: movl (%rdi,%r10), %edx
-; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
-; AVX512-NEXT: btl %r9d, %edx
-; AVX512-NEXT: movq %r13, 48(%rdi)
-; AVX512-NEXT: movq %r12, 56(%rdi)
-; AVX512-NEXT: movq %r15, 32(%rdi)
-; AVX512-NEXT: movq %rbx, 40(%rdi)
-; AVX512-NEXT: movq %r8, 16(%rdi)
-; AVX512-NEXT: movq %rsi, 24(%rdi)
-; AVX512-NEXT: movq %rcx, (%rdi)
-; AVX512-NEXT: movq %rax, 8(%rdi)
-; AVX512-NEXT: setae %al
-; AVX512-NEXT: addq $168, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX-LABEL: init_eq_i512:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: andl $60, %ecx
+; AVX-NEXT: movl (%rdi,%rcx), %r8d
+; AVX-NEXT: btl %esi, %r8d
+; AVX-NEXT: setae %al
+; AVX-NEXT: btrl %esi, %r8d
+; AVX-NEXT: shlxl %esi, %edx, %edx
+; AVX-NEXT: orl %r8d, %edx
+; AVX-NEXT: movl %edx, (%rdi,%rcx)
+; AVX-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -1698,115 +906,46 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_cmpz_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %esi
-; X86-NEXT: movl 36(%esp,%esi), %eax
-; X86-NEXT: movl 40(%esp,%esi), %edi
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl 32(%esp,%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esp,%esi), %esi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: xorl 12(%ecx), %esi
-; X86-NEXT: xorl 8(%ecx), %edx
-; X86-NEXT: xorl 4(%ecx), %eax
-; X86-NEXT: xorl (%ecx), %edi
-; X86-NEXT: movl %edx, 8(%ecx)
-; X86-NEXT: movl %esi, 12(%ecx)
-; X86-NEXT: movl %edi, (%ecx)
-; X86-NEXT: movl %eax, 4(%ecx)
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: andl $96, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: xorl %edx, (%eax,%ecx)
+; X86-NEXT: movl (%eax), %ecx
+; X86-NEXT: movl 4(%eax), %edx
+; X86-NEXT: orl 12(%eax), %edx
+; X86-NEXT: orl 8(%eax), %ecx
+; X86-NEXT: orl %edx, %ecx
; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: complement_cmpz_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: cmovneq %rsi, %rax
-; SSE-NEXT: xorq 8(%rdi), %rdx
-; SSE-NEXT: xorq (%rdi), %rax
-; SSE-NEXT: movq %rax, (%rdi)
-; SSE-NEXT: movq %rdx, 8(%rdi)
-; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: shll %cl, %eax
+; SSE-NEXT: andl $96, %ecx
+; SSE-NEXT: shrl $3, %ecx
+; SSE-NEXT: xorl %eax, (%rdi,%rcx)
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: orq 8(%rdi), %rax
; SSE-NEXT: setne %al
; SSE-NEXT: retq
;
-; AVX2-LABEL: complement_cmpz_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shlxq %rcx, %rax, %rax
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rax, %rdx
-; AVX2-NEXT: cmovneq %rsi, %rax
-; AVX2-NEXT: xorq 8(%rdi), %rdx
-; AVX2-NEXT: xorq (%rdi), %rax
-; AVX2-NEXT: movq %rax, (%rdi)
-; AVX2-NEXT: movq %rdx, 8(%rdi)
-; AVX2-NEXT: orq %rdx, %rax
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: complement_cmpz_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movl $1, %edx
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %rdx, %rsi
-; AVX512-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rdx, %rsi
-; AVX512-NEXT: cmovneq %rax, %rdx
-; AVX512-NEXT: xorq 8(%rdi), %rsi
-; AVX512-NEXT: xorq (%rdi), %rdx
-; AVX512-NEXT: movq %rdx, (%rdi)
-; AVX512-NEXT: movq %rsi, 8(%rdi)
-; AVX512-NEXT: orq %rsi, %rdx
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: retq
+; AVX-LABEL: complement_cmpz_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $esi killed $esi def $rsi
+; AVX-NEXT: movl $1, %eax
+; AVX-NEXT: shlxl %esi, %eax, %eax
+; AVX-NEXT: andl $96, %esi
+; AVX-NEXT: shrl $3, %esi
+; AVX-NEXT: xorl %eax, (%rdi,%rsi)
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: orq 8(%rdi), %rax
+; AVX-NEXT: setne %al
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -1821,14 +960,171 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-LABEL: reset_multiload_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: btrl %edx, %ebx
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: movl %ebx, (%ecx,%esi)
+; X86-NEXT: jae .LBB22_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB22_2:
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: retl
+;
+; X64-LABEL: reset_multiload_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %r9d
+; X64-NEXT: movl %r9d, %r8d
+; X64-NEXT: btrl %esi, %r8d
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: btl %esi, %r9d
+; X64-NEXT: jb .LBB22_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: movl (%rdx), %eax
+; X64-NEXT: .LBB22_2:
+; X64-NEXT: movl %r8d, (%rdi,%rcx)
+; X64-NEXT: retq
+ %rem = and i32 %position, 127
+ %ofs = zext nneg i32 %rem to i128
+ %bit = shl nuw i128 1, %ofs
+ %mask = xor i128 %bit, -1
+ %ld = load i128, ptr %word
+ %sel = load i32, ptr %p
+ %test = and i128 %ld, %bit
+ %res = and i128 %ld, %mask
+ %cmp = icmp eq i128 %test, 0
+ store i128 %res, ptr %word
+ %ret = select i1 %cmp, i32 %sel, i32 0
+ ret i32 %ret
+}
+
+; Multiple uses of the store chain AND stored value
+define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind {
+; X86-LABEL: chain_reset_i256:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $-2, %edi
+; X86-NEXT: roll %cl, %edi
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: andl $28, %ecx
+; X86-NEXT: andl %edi, (%esi,%ecx)
+; X86-NEXT: movl 8(%esi), %ebx
+; X86-NEXT: movl (%esi), %edi
+; X86-NEXT: movl 4(%esi), %ecx
+; X86-NEXT: movl 12(%esi), %ebp
+; X86-NEXT: orl 28(%esi), %ebp
+; X86-NEXT: orl 20(%esi), %ecx
+; X86-NEXT: orl %ebp, %ecx
+; X86-NEXT: orl 24(%esi), %ebx
+; X86-NEXT: movl 16(%esi), %ebp
+; X86-NEXT: orl %edi, %ebp
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: movl (%edx), %esi
+; X86-NEXT: movl %edi, (%edx)
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: orl %ecx, %ebp
+; X86-NEXT: jne .LBB23_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: addl %esi, %eax
+; X86-NEXT: .LBB23_2:
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; SSE-LABEL: chain_reset_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: # kill: def $ecx killed $ecx def $rcx
+; SSE-NEXT: movl $-2, %eax
+; SSE-NEXT: roll %cl, %eax
+; SSE-NEXT: shrl $3, %ecx
+; SSE-NEXT: andl $28, %ecx
+; SSE-NEXT: andl %eax, (%rdi,%rcx)
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %r8
+; SSE-NEXT: orq 24(%rdi), %r8
+; SSE-NEXT: movq 16(%rdi), %rdi
+; SSE-NEXT: orq %rcx, %rdi
+; SSE-NEXT: movl (%rsi), %eax
+; SSE-NEXT: movl %ecx, (%rsi)
+; SSE-NEXT: movl (%rdx), %ecx
+; SSE-NEXT: addl %ecx, %eax
+; SSE-NEXT: orq %r8, %rdi
+; SSE-NEXT: cmovnel %ecx, %eax
+; SSE-NEXT: retq
+;
+; AVX-LABEL: chain_reset_i256:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX-NEXT: movl $-2, %eax
+; AVX-NEXT: roll %cl, %eax
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: andl $28, %ecx
+; AVX-NEXT: andl %eax, (%rdi,%rcx)
+; AVX-NEXT: vmovdqu (%rdi), %ymm0
+; AVX-NEXT: movl (%rdi), %ecx
+; AVX-NEXT: movl (%rsi), %eax
+; AVX-NEXT: movl %ecx, (%rsi)
+; AVX-NEXT: movl (%rdx), %ecx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vptest %ymm0, %ymm0
+; AVX-NEXT: cmovnel %ecx, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+ %rem = and i32 %position, 255
+ %ofs = zext nneg i32 %rem to i256
+ %bit = shl nuw i256 1, %ofs
+ %ld0 = load i256, ptr %p0
+ %msk = xor i256 %bit, -1
+ %res = and i256 %ld0, %msk
+ store i256 %res, ptr %p0
+ %cmp = icmp ne i256 %res, 0
+ %ld1 = load i32, ptr %p1
+ %trunc = trunc i256 %res to i32
+ store i32 %trunc, ptr %p1
+ %ld2 = load i32, ptr %p2
+ %add = add i32 %ld1, %ld2
+ %sel = select i1 %cmp, i32 %ld2, i32 %add
+ ret i32 %sel
+}
+
+; BTC/BT/BTS sequence on same i128
+define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
+; X86-LABEL: sequence_i128:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: subl $144, %esp
+; X86-NEXT: movb 20(%ebp), %ch
+; X86-NEXT: movb 12(%ebp), %cl
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1842,36 +1138,80 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 40(%esp,%eax), %edx
-; X86-NEXT: movl 44(%esp,%eax), %esi
+; X86-NEXT: movl 56(%esp,%eax), %edx
+; X86-NEXT: movl 60(%esp,%eax), %esi
; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 32(%esp,%eax), %edi
-; X86-NEXT: movl 36(%esp,%eax), %ebx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %ebx
; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 16(%ebp), %eax
-; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movb %ch, %al
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 84(%esp,%eax), %edx
+; X86-NEXT: movl 88(%esp,%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl 20(%ebp), %ecx
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esp,%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 92(%esp,%eax), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: andl %ebx, 4(%eax)
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: notl %edi
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: andl $96, %ebx
-; X86-NEXT: shrl $3, %ebx
-; X86-NEXT: movl (%eax,%ebx), %ebx
-; X86-NEXT: andl %edi, (%eax)
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %esi, 12(%eax)
-; X86-NEXT: notl %edx
-; X86-NEXT: andl %edx, 8(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: btl %ecx, %ebx
-; X86-NEXT: jae .LBB22_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB22_2:
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl 8(%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl 12(%eax), %esi
+; X86-NEXT: xorl (%eax), %edi
+; X86-NEXT: xorl 4(%eax), %ebx
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: andb $96, %al
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 96(%esp,%eax), %eax
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setae %al
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl %edx, 8(%ecx)
+; X86-NEXT: movl %esi, 12(%ecx)
+; X86-NEXT: movl %edi, (%ecx)
+; X86-NEXT: movl %ebx, 4(%ecx)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -1879,96 +1219,129 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: reset_multiload_i128:
+; SSE-LABEL: sequence_i128:
; SSE: # %bb.0:
+; SSE-NEXT: movl %ecx, %eax
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %esi
-; SSE-NEXT: xorl %r8d, %r8d
-; SSE-NEXT: shldq %cl, %rsi, %r8
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: shlq %cl, %rsi
+; SSE-NEXT: movl $1, %r8d
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %r8, %rsi
+; SSE-NEXT: movl $1, %r9d
+; SSE-NEXT: shlq %cl, %r9
+; SSE-NEXT: xorl %r11d, %r11d
; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rsi, %r8
-; SSE-NEXT: cmovneq %rax, %rsi
-; SSE-NEXT: notq %r8
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: movl %ecx, %r9d
-; SSE-NEXT: andl $96, %r9d
-; SSE-NEXT: shrl $3, %r9d
-; SSE-NEXT: movl (%rdi,%r9), %r9d
-; SSE-NEXT: btl %ecx, %r9d
-; SSE-NEXT: jb .LBB22_2
-; SSE-NEXT: # %bb.1:
-; SSE-NEXT: movl (%rdx), %eax
-; SSE-NEXT: .LBB22_2:
-; SSE-NEXT: andq %rsi, (%rdi)
-; SSE-NEXT: andq %r8, 8(%rdi)
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: cmovneq %r9, %rsi
+; SSE-NEXT: cmovneq %r11, %r9
+; SSE-NEXT: xorl %r10d, %r10d
+; SSE-NEXT: movl %eax, %ecx
+; SSE-NEXT: shldq %cl, %r8, %r10
+; SSE-NEXT: shlq %cl, %r8
+; SSE-NEXT: testb $64, %al
+; SSE-NEXT: cmovneq %r8, %r10
+; SSE-NEXT: cmovneq %r11, %r8
+; SSE-NEXT: xorq 8(%rdi), %rsi
+; SSE-NEXT: xorq (%rdi), %r9
+; SSE-NEXT: movl %edx, %ecx
+; SSE-NEXT: andb $32, %cl
+; SSE-NEXT: movq %r9, %rax
+; SSE-NEXT: shrdq %cl, %rsi, %rax
+; SSE-NEXT: movq %rsi, %r11
+; SSE-NEXT: shrq %cl, %r11
+; SSE-NEXT: testb $64, %dl
+; SSE-NEXT: cmoveq %rax, %r11
+; SSE-NEXT: btl %edx, %r11d
+; SSE-NEXT: setae %al
+; SSE-NEXT: orq %r10, %rsi
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: movq %r9, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
; SSE-NEXT: retq
;
-; AVX2-LABEL: reset_multiload_i128:
+; AVX2-LABEL: sequence_i128:
; AVX2: # %bb.0:
+; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movl $1, %r8d
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: movl $1, %r10d
; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %r8, %rsi
-; AVX2-NEXT: shlxq %rcx, %r8, %r8
+; AVX2-NEXT: shldq %cl, %r10, %rsi
+; AVX2-NEXT: shlxq %rcx, %r10, %r8
; AVX2-NEXT: testb $64, %cl
; AVX2-NEXT: cmovneq %r8, %rsi
-; AVX2-NEXT: cmovneq %rax, %r8
-; AVX2-NEXT: notq %rsi
-; AVX2-NEXT: notq %r8
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: andl $96, %r9d
-; AVX2-NEXT: shrl $3, %r9d
-; AVX2-NEXT: movl (%rdi,%r9), %r9d
-; AVX2-NEXT: btl %ecx, %r9d
-; AVX2-NEXT: jb .LBB22_2
-; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: movl (%rdx), %eax
-; AVX2-NEXT: .LBB22_2:
-; AVX2-NEXT: andq %r8, (%rdi)
-; AVX2-NEXT: andq %rsi, 8(%rdi)
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: cmovneq %r9, %r8
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shldq %cl, %r10, %r11
+; AVX2-NEXT: shlxq %rax, %r10, %r10
+; AVX2-NEXT: testb $64, %al
+; AVX2-NEXT: cmovneq %r10, %r11
+; AVX2-NEXT: cmovneq %r9, %r10
+; AVX2-NEXT: xorq 8(%rdi), %rsi
+; AVX2-NEXT: xorq (%rdi), %r8
+; AVX2-NEXT: movl %edx, %ecx
+; AVX2-NEXT: andb $32, %cl
+; AVX2-NEXT: movq %r8, %rax
+; AVX2-NEXT: shrdq %cl, %rsi, %rax
+; AVX2-NEXT: shrxq %rcx, %rsi, %rcx
+; AVX2-NEXT: testb $64, %dl
+; AVX2-NEXT: cmoveq %rax, %rcx
+; AVX2-NEXT: btl %edx, %ecx
+; AVX2-NEXT: setae %al
+; AVX2-NEXT: orq %r11, %rsi
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: movq %r8, (%rdi)
+; AVX2-NEXT: movq %rsi, 8(%rdi)
; AVX2-NEXT: retq
;
-; AVX512-LABEL: reset_multiload_i128:
+; AVX512-LABEL: sequence_i128:
; AVX512: # %bb.0:
+; AVX512-NEXT: movl %ecx, %eax
; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %r8d
+; AVX512-NEXT: movl $1, %r9d
; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %r8, %rsi
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: shlxq %rcx, %r8, %r8
+; AVX512-NEXT: shldq %cl, %r9, %rsi
+; AVX512-NEXT: xorl %r10d, %r10d
+; AVX512-NEXT: shlxq %rcx, %r9, %r8
; AVX512-NEXT: testb $64, %cl
; AVX512-NEXT: cmovneq %r8, %rsi
-; AVX512-NEXT: cmovneq %rax, %r8
-; AVX512-NEXT: notq %rsi
-; AVX512-NEXT: notq %r8
-; AVX512-NEXT: movl %ecx, %r9d
-; AVX512-NEXT: andl $96, %r9d
-; AVX512-NEXT: shrl $3, %r9d
-; AVX512-NEXT: movl (%rdi,%r9), %r9d
-; AVX512-NEXT: btl %ecx, %r9d
-; AVX512-NEXT: jb .LBB22_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: movl (%rdx), %eax
-; AVX512-NEXT: .LBB22_2:
-; AVX512-NEXT: andq %r8, (%rdi)
-; AVX512-NEXT: andq %rsi, 8(%rdi)
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: cmovneq %r10, %r8
+; AVX512-NEXT: xorl %r11d, %r11d
+; AVX512-NEXT: movl %eax, %ecx
+; AVX512-NEXT: shldq %cl, %r9, %r11
+; AVX512-NEXT: shlxq %rax, %r9, %r9
+; AVX512-NEXT: testb $64, %al
+; AVX512-NEXT: cmovneq %r9, %r11
+; AVX512-NEXT: cmovneq %r10, %r9
+; AVX512-NEXT: xorq 8(%rdi), %rsi
+; AVX512-NEXT: xorq (%rdi), %r8
+; AVX512-NEXT: movl %edx, %ecx
+; AVX512-NEXT: andb $32, %cl
+; AVX512-NEXT: movq %r8, %rax
+; AVX512-NEXT: shrdq %cl, %rsi, %rax
+; AVX512-NEXT: shrxq %rcx, %rsi, %rcx
+; AVX512-NEXT: testb $64, %dl
+; AVX512-NEXT: cmoveq %rax, %rcx
+; AVX512-NEXT: btl %edx, %ecx
+; AVX512-NEXT: setae %al
+; AVX512-NEXT: orq %r11, %rsi
+; AVX512-NEXT: orq %r9, %r8
+; AVX512-NEXT: movq %r8, (%rdi)
+; AVX512-NEXT: movq %rsi, 8(%rdi)
; AVX512-NEXT: retq
- %rem = and i32 %position, 127
- %ofs = zext nneg i32 %rem to i128
- %bit = shl nuw i128 1, %ofs
- %mask = xor i128 %bit, -1
+ %rem0 = and i32 %pos0, 127
+ %rem1 = and i32 %pos1, 127
+ %rem2 = and i32 %pos2, 127
+ %ofs0 = zext nneg i32 %rem0 to i128
+ %ofs1 = zext nneg i32 %rem1 to i128
+ %ofs2 = zext nneg i32 %rem2 to i128
+ %bit0 = shl nuw i128 1, %ofs0
+ %bit1 = shl nuw i128 1, %ofs1
+ %bit2 = shl nuw i128 1, %ofs2
%ld = load i128, ptr %word
- %sel = load i32, ptr %p
- %test = and i128 %ld, %bit
- %res = and i128 %ld, %mask
- %cmp = icmp eq i128 %test, 0
- store i128 %res, ptr %word
- %ret = select i1 %cmp, i32 %sel, i32 0
- ret i32 %ret
+ %res0 = xor i128 %ld, %bit0
+ %test1 = and i128 %res0, %bit1
+ %cmp1 = icmp eq i128 %test1, 0
+ %res2 = or i128 %res0, %bit2
+ store i128 %res2, ptr %word
+ ret i1 %cmp1
}
diff --git a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
index f36baba..ab8498d 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-addrtaken.ll
@@ -14,7 +14,6 @@ entry:
}
; CHECK: _ZL10myCallbacki:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
define internal void @_ZL10myCallbacki(i32 %value) !type !2 {
entry:
%sink = alloca i32, align 4
@@ -33,6 +32,6 @@ entry:
;; Flags -- Potential indirect target so LSB is set to 1. Other bits are 0.
; CHECK-NEXT: .byte 1
;; Function Entry PC
-; CHECK-NEXT: .quad [[LABEL_FUNC]]
+; CHECK-NEXT: .quad _ZL10myCallbacki
;; Function type ID
; CHECK-NEXT: .quad -5212364466660467813
diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
index cdbad66..02d7107 100644
--- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
+++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll
@@ -11,7 +11,6 @@ declare !type !1 i32 @direct_bar(i8)
declare !type !2 ptr @direct_baz(ptr)
; CHECK: ball:
-; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]:
define ptr @ball() {
entry:
call void @direct_foo()
@@ -42,7 +41,7 @@ entry:
;; Flags
; CHECK-NEXT: .byte 7
;; Function Entry PC
-; CHECK-NEXT: .quad [[LABEL_FUNC]]
+; CHECK-NEXT: .quad ball
;; Function type ID -- set to 0 as no type metadata attached to function.
; CHECK-NEXT: .quad 0
;; Number of unique direct callees.
diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll
index feac3dc..30f1874 100644
--- a/llvm/test/CodeGen/X86/gfni-shifts.ll
+++ b/llvm/test/CodeGen/X86/gfni-shifts.ll
@@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_shl_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -1876,15 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_lshr_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1}
+; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; GFNIAVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
ret <64 x i8> %shift
@@ -2232,36 +2231,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; GFNIAVX512BW-LABEL: var_ashr_v64i8:
; GFNIAVX512BW: # %bb.0:
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; GFNIAVX512BW-NEXT: kmovq %rax, %k1
+; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; GFNIAVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/narrow-add-i64.ll b/llvm/test/CodeGen/X86/narrow-add-i64.ll
new file mode 100644
index 0000000..a7a54fd
--- /dev/null
+++ b/llvm/test/CodeGen/X86/narrow-add-i64.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64
+
+define i64 @test_add_i64_i16_const(i16 %a) nounwind {
+; X86-LABEL: test_add_i64_i16_const:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl $42, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: test_add_i64_i16_const:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %eax
+; X64-NEXT: addq $42, %rax
+; X64-NEXT: retq
+ %zext_a = zext i16 %a to i64
+ %sum = add nuw nsw i64 %zext_a, 42
+ ret i64 %sum
+}
+
+; TODO: The first 48 bits are all zeros, so we can safely truncate to 32 bit addition
+define i64 @test_add_i64_i16_zext(i16 %a, i16 %b) nounwind {
+; X86-LABEL: test_add_i64_i16_zext:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: test_add_i64_i16_zext:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %ecx
+; X64-NEXT: movzwl %si, %eax
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: retq
+ %zext_a = zext i16 %a to i64
+ %zext_b = zext i16 %b to i64
+ %sum = add nuw nsw i64 %zext_a, %zext_b
+ ret i64 %sum
+}
+
+; Negative: Set the 32nd bit of a to force 64 bit addition; we do not truncate to 32 bit addition in this case
+define i64 @negative_test_add_i64_i16(i16 %a) nounwind {
+; X86-LABEL: negative_test_add_i64_i16:
+; X86: # %bb.0:
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: addl $42, %eax
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: retl
+;
+; X64-LABEL: negative_test_add_i64_i16:
+; X64: # %bb.0:
+; X64-NEXT: movzwl %di, %ecx
+; X64-NEXT: movabsq $4294967338, %rax # imm = 0x10000002A
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: retq
+ %zext_a = zext i16 %a to i64
+ %or_a = or i64 %zext_a, 4294967296
+ %sum = add nuw nsw i64 %or_a, 42
+ ret i64 %sum
+}
+
+; Negative: We don't truncate to 32 bit addition when the operands are sign extended
+define i64 @negative_test_add_i64_i16_sext(i16 %a, i16 %b) nounwind {
+; X86-LABEL: negative_test_add_i64_i16_sext:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: sarl $31, %edx
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: negative_test_add_i64_i16_sext:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: # kill: def $edi killed $edi def $rdi
+; X64-NEXT: movswq %di, %rcx
+; X64-NEXT: movswq %si, %rax
+; X64-NEXT: addq %rcx, %rax
+; X64-NEXT: retq
+ %sext_a = sext i16 %a to i64
+ %sext_b = sext i16 %b to i64
+ %sum = add nuw nsw i64 %sext_a, %sext_b
+ ret i64 %sum
+}
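
A minimal IR sketch of the narrowed form the TODO above (test_add_i64_i16_zext) is aiming for; the function name and exact flags are illustrative only, not part of the test file. Since both operands are zero-extended from i16, their sum fits comfortably in 32 bits, so the i64 add could be performed as a 32-bit add whose result is zero-extended:

define i64 @narrowed_add_sketch(i16 %a, i16 %b) {
  ; The sum of two zero-extended i16 values fits in 17 bits, so a 32-bit
  ; add cannot wrap and the upper 32 bits of the final i64 result are zero.
  %zext_a = zext i16 %a to i32
  %zext_b = zext i16 %b to i32
  %sum32 = add nuw nsw i32 %zext_a, %zext_b
  %sum = zext nneg i32 %sum32 to i64
  ret i64 %sum
}

On x86-64 this shape would let the backend emit a plain 32-bit addl, relying on the implicit zero-extension of 32-bit results, instead of the addq currently shown in the checks above.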
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
new file mode 100644
index 0000000..aef44cc
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE4
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+
+define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
+; SSE2-LABEL: pr166534:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movq (%rdi), %rax
+; SSE2-NEXT: movq 8(%rdi), %r8
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movq (%rsi), %r9
+; SSE2-NEXT: movq 8(%rsi), %rdi
+; SSE2-NEXT: movdqu (%rsi), %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %esi
+; SSE2-NEXT: xorl %r10d, %r10d
+; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT: sete %r10b
+; SSE2-NEXT: orq %r10, (%rdx)
+; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT: jne .LBB0_2
+; SSE2-NEXT: # %bb.1: # %if.then
+; SSE2-NEXT: xorq %r9, %rax
+; SSE2-NEXT: xorq %rdi, %r8
+; SSE2-NEXT: xorl %edx, %edx
+; SSE2-NEXT: orq %rax, %r8
+; SSE2-NEXT: sete %dl
+; SSE2-NEXT: orq %rdx, (%rcx)
+; SSE2-NEXT: .LBB0_2: # %if.end
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: pr166534:
+; SSE4: # %bb.0: # %entry
+; SSE4-NEXT: movq (%rdi), %rax
+; SSE4-NEXT: movq 8(%rdi), %r8
+; SSE4-NEXT: movdqu (%rdi), %xmm0
+; SSE4-NEXT: movq (%rsi), %r9
+; SSE4-NEXT: movq 8(%rsi), %rdi
+; SSE4-NEXT: movdqu (%rsi), %xmm1
+; SSE4-NEXT: pxor %xmm0, %xmm1
+; SSE4-NEXT: xorl %esi, %esi
+; SSE4-NEXT: ptest %xmm1, %xmm1
+; SSE4-NEXT: sete %sil
+; SSE4-NEXT: orq %rsi, (%rdx)
+; SSE4-NEXT: ptest %xmm1, %xmm1
+; SSE4-NEXT: jne .LBB0_2
+; SSE4-NEXT: # %bb.1: # %if.then
+; SSE4-NEXT: xorq %r9, %rax
+; SSE4-NEXT: xorq %rdi, %r8
+; SSE4-NEXT: xorl %edx, %edx
+; SSE4-NEXT: orq %rax, %r8
+; SSE4-NEXT: sete %dl
+; SSE4-NEXT: orq %rdx, (%rcx)
+; SSE4-NEXT: .LBB0_2: # %if.end
+; SSE4-NEXT: retq
+;
+; AVX2-LABEL: pr166534:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: movq 8(%rdi), %r8
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: movq (%rsi), %rdi
+; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: movq 8(%rsi), %rsi
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: sete %r9b
+; AVX2-NEXT: orq %r9, (%rdx)
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: jne .LBB0_2
+; AVX2-NEXT: # %bb.1: # %if.then
+; AVX2-NEXT: xorq %rdi, %rax
+; AVX2-NEXT: xorq %rsi, %r8
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: orq %rax, %r8
+; AVX2-NEXT: sete %dl
+; AVX2-NEXT: orq %rdx, (%rcx)
+; AVX2-NEXT: .LBB0_2: # %if.end
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: pr166534:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: movq (%rdi), %rax
+; AVX512-NEXT: movq 8(%rdi), %r8
+; AVX512-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512-NEXT: movq (%rsi), %r9
+; AVX512-NEXT: movq 8(%rsi), %rdi
+; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: sete %sil
+; AVX512-NEXT: orq %rsi, (%rdx)
+; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: jne .LBB0_2
+; AVX512-NEXT: # %bb.1: # %if.then
+; AVX512-NEXT: xorq %r9, %rax
+; AVX512-NEXT: xorq %rdi, %r8
+; AVX512-NEXT: xorl %edx, %edx
+; AVX512-NEXT: orq %rax, %r8
+; AVX512-NEXT: sete %dl
+; AVX512-NEXT: orq %rdx, (%rcx)
+; AVX512-NEXT: .LBB0_2: # %if.end
+; AVX512-NEXT: retq
+entry:
+ %a = load i128, ptr %pa, align 8
+ %b = load i128, ptr %pb, align 8
+ %cmp = icmp eq i128 %a, %b
+ %conv1 = zext i1 %cmp to i128
+ %c = load i128, ptr %pc, align 8
+ %or = or i128 %c, %conv1
+ store i128 %or, ptr %pc, align 8
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %d = load i128, ptr %pd, align 8
+ %or7 = or i128 %d, %conv1
+ store i128 %or7, ptr %pd, align 8
+ br label %if.end
+
+if.end:
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/pr166744.ll b/llvm/test/CodeGen/X86/pr166744.ll
new file mode 100644
index 0000000..21b25d8
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr166744.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=POSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=haswell | FileCheck %s --check-prefixes=NOPOSTRA
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=NOPOSTRA
+
+; Ensure reloads are after narrowed i512 -> i32 store
+define i1 @PR166744(ptr %v, i64 %idx, i1 zeroext %b) {
+; POSTRA-LABEL: PR166744:
+; POSTRA: # %bb.0:
+; POSTRA-NEXT: movl $1029, %eax # imm = 0x405
+; POSTRA-NEXT: shlxl %esi, %edx, %edx
+; POSTRA-NEXT: bextrl %eax, %esi, %eax
+; POSTRA-NEXT: movl (%rdi,%rax,4), %ecx
+; POSTRA-NEXT: btrl %esi, %ecx
+; POSTRA-NEXT: orl %ecx, %edx
+; POSTRA-NEXT: movl %edx, (%rdi,%rax,4)
+; POSTRA-NEXT: movq 16(%rdi), %rax
+; POSTRA-NEXT: movq (%rdi), %rcx
+; POSTRA-NEXT: movq 24(%rdi), %rdx
+; POSTRA-NEXT: movq 8(%rdi), %rsi
+; POSTRA-NEXT: orq 56(%rdi), %rdx
+; POSTRA-NEXT: orq 40(%rdi), %rsi
+; POSTRA-NEXT: orq 48(%rdi), %rax
+; POSTRA-NEXT: orq 32(%rdi), %rcx
+; POSTRA-NEXT: orq %rdx, %rsi
+; POSTRA-NEXT: orq %rax, %rcx
+; POSTRA-NEXT: orq %rsi, %rcx
+; POSTRA-NEXT: setne %al
+; POSTRA-NEXT: retq
+;
+; NOPOSTRA-LABEL: PR166744:
+; NOPOSTRA: # %bb.0:
+; NOPOSTRA-NEXT: movl %esi, %eax
+; NOPOSTRA-NEXT: shrl $3, %eax
+; NOPOSTRA-NEXT: andl $60, %eax
+; NOPOSTRA-NEXT: movl (%rdi,%rax), %ecx
+; NOPOSTRA-NEXT: btrl %esi, %ecx
+; NOPOSTRA-NEXT: shlxl %esi, %edx, %edx
+; NOPOSTRA-NEXT: orl %ecx, %edx
+; NOPOSTRA-NEXT: movl %edx, (%rdi,%rax)
+; NOPOSTRA-NEXT: movq 16(%rdi), %rax
+; NOPOSTRA-NEXT: movq (%rdi), %rcx
+; NOPOSTRA-NEXT: movq 8(%rdi), %rdx
+; NOPOSTRA-NEXT: movq 24(%rdi), %rsi
+; NOPOSTRA-NEXT: orq 56(%rdi), %rsi
+; NOPOSTRA-NEXT: orq 40(%rdi), %rdx
+; NOPOSTRA-NEXT: orq 48(%rdi), %rax
+; NOPOSTRA-NEXT: orq 32(%rdi), %rcx
+; NOPOSTRA-NEXT: orq %rsi, %rdx
+; NOPOSTRA-NEXT: orq %rax, %rcx
+; NOPOSTRA-NEXT: orq %rdx, %rcx
+; NOPOSTRA-NEXT: setne %al
+; NOPOSTRA-NEXT: retq
+ %rem = and i64 %idx, 511
+ %sh_prom = zext nneg i64 %rem to i512
+ %shl = shl nuw i512 1, %sh_prom
+ %not = xor i512 %shl, -1
+ %load = load i512, ptr %v, align 8
+ %and = and i512 %load, %not
+ %conv2 = zext i1 %b to i512
+ %shl4 = shl nuw i512 %conv2, %sh_prom
+ %or = or i512 %and, %shl4
+ store i512 %or, ptr %v, align 8
+ %cmp = icmp ne i512 %or, 0
+ ret i1 %cmp
+}
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 0fb0420..aff2228 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3
-; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
index 103d570..4450d07 100644
--- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll
@@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = lshr <64 x i8> %a, %b
diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
index efd7429..41238ac 100644
--- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll
@@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2
-; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1
+; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555
+; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT: retq
%shift = shl <64 x i8> %a, %b
ret <64 x i8> %shift
diff --git a/llvm/test/CodeGen/X86/issue163738.ll b/llvm/test/CodeGen/X86/vpternlog.ll
index 61fe043..bd7478d 100644
--- a/llvm/test/CodeGen/X86/issue163738.ll
+++ b/llvm/test/CodeGen/X86/vpternlog.ll
@@ -11,3 +11,15 @@ define <8 x i64> @foo(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c) {
%and3 = xor <8 x i64> %and3.demorgan, splat (i64 -1)
ret <8 x i64> %and3
}
+
+define <8 x i64> @xorbitcast(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c) {
+; CHECK-LABEL: xorbitcast:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = ~(zmm0 | zmm2 | zmm1)
+; CHECK-NEXT: retq
+ %or1 = or <64 x i8> %a, %b
+ %or2 = or <64 x i8> %or1, %c
+ %cast = bitcast <64 x i8> %or2 to <8 x i64>
+ %xor = xor <8 x i64> %cast, splat (i64 -1)
+ ret <8 x i64> %xor
+}
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 84c2cc6..7735500 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -168,8 +168,8 @@ define void @load_2byte_chunk_of_4byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_1byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movb %al, (%rdx)
@@ -188,17 +188,15 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -215,13 +213,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
@@ -236,14 +232,11 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -260,23 +253,19 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X86-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_8byte_alloca:
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %bl, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movb %dl, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %ebx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
%byteOff.numbits = shl nuw nsw i64 %byteOff, 3
@@ -292,8 +281,8 @@ define void @load_1byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_2byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movw %ax, (%rdx)
@@ -312,17 +301,15 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -339,18 +326,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
@@ -360,14 +345,11 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -386,18 +368,16 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %si, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movw %dx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
@@ -413,8 +393,8 @@ define void @load_2byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: load_4byte_chunk_of_8byte_alloca:
; X64-NO-BMI2: # %bb.0:
-; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
@@ -433,17 +413,15 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ecx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%ecx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx
@@ -460,18 +438,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-NO-BMI2-HAVE-SHLD: # %bb.0:
; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl %cl, %edx
; X86-NO-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
+; X86-NO-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-NO-BMI2-HAVE-SHLD-NEXT: retl
;
@@ -481,14 +457,11 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-NO-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%edx), %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %bl
; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi
@@ -507,18 +480,16 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X86-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movd %xmm0, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %esi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%edx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edx, %edx
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: testb $32, %cl
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, (%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: cmovel %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, (%eax)
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: retl
%init = load <8 x i8>, ptr %src, align 1
@@ -536,8 +507,8 @@ define void @load_4byte_chunk_of_8byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -557,8 +528,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -571,8 +542,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -591,8 +562,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -698,8 +669,8 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -719,8 +690,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -733,8 +704,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -753,8 +724,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -859,8 +830,8 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -880,8 +851,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -894,8 +865,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -914,8 +885,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1020,8 +991,8 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst) nounwind {
; X64-NO-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-NO-SHLD: # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-NO-BMI2-NO-SHLD-NEXT: movq %xmm0, %rdi
@@ -1041,8 +1012,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-NO-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-NO-BMI2-HAVE-SHLD: # %bb.0:
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-NO-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi
@@ -1055,8 +1026,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
;
; X64-HAVE-BMI2-NO-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-NO-SHLD: # %bb.0:
-; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-NO-SHLD-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm1, %rax
; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %xmm0, %rcx
@@ -1075,8 +1046,8 @@ define void @load_8byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_8byte_chunk_of_16byte_alloca:
; X64-HAVE-BMI2-HAVE-SHLD: # %bb.0:
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shll $3, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movdqu (%rdi), %xmm0
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rax
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %xmm0, %rsi