aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/X86/bittest-big-integer.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/X86/bittest-big-integer.ll')
-rw-r--r--llvm/test/CodeGen/X86/bittest-big-integer.ll7137
1 files changed, 1038 insertions, 6099 deletions
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 19d751d1..06e7d47 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -203,24 +203,14 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %edx
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB5_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: .LBB5_2:
-; X86-NEXT: andl 4(%eax), %esi
-; X86-NEXT: andl (%eax), %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: setne %al
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $32, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
; X64-LABEL: test_ne_i64:
@@ -242,38 +232,20 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB6_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB6_2:
-; X86-NEXT: movl (%edx), %ecx
-; X86-NEXT: movl 4(%edx), %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: andl %esi, %ebx
-; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: andl %eax, %ebp
-; X86-NEXT: xorl %esi, %edi
-; X86-NEXT: xorl %eax, %ecx
-; X86-NEXT: orl %ebx, %ebp
-; X86-NEXT: setne %al
-; X86-NEXT: movl %ecx, (%edx)
-; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btcl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i64:
@@ -300,40 +272,20 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %esi
-; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB7_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: .LBB7_2:
-; X86-NEXT: movl (%edx), %eax
-; X86-NEXT: movl 4(%edx), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: notl %edi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: andl %esi, %ebp
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %ecx, %edi
-; X86-NEXT: andl %eax, %esi
-; X86-NEXT: orl %ebx, %ebp
-; X86-NEXT: sete %al
-; X86-NEXT: movl %esi, (%edx)
-; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i64:
@@ -361,38 +313,20 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: testb $32, %cl
-; X86-NEXT: je .LBB8_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: .LBB8_2:
-; X86-NEXT: movl (%edx), %ecx
-; X86-NEXT: movl 4(%edx), %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: andl %esi, %ebx
-; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: andl %eax, %ebp
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: orl %ebx, %ebp
-; X86-NEXT: setne %al
-; X86-NEXT: movl %ecx, (%edx)
-; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $32, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btsl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i64:
@@ -419,52 +353,47 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl $1, %eax
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: xorl %edi, %edi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: shll %cl, %esi
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: shll %cl, %eax
; X86-NEXT: testb $32, %cl
; X86-NEXT: je .LBB9_2
; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl $0, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl $0, %edx
; X86-NEXT: .LBB9_2:
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: notl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: notl %esi
+; X86-NEXT: notl %edx
; X86-NEXT: je .LBB9_4
; X86-NEXT: # %bb.3:
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB9_4:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 4(%ecx), %ecx
-; X86-NEXT: andl %ecx, %edx
-; X86-NEXT: andl %ecx, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl (%edi), %ecx
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: andl %ecx, %ebp
-; X86-NEXT: orl %esi, %ebp
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %ebp, (%edi)
-; X86-NEXT: movl %ebx, 4(%edi)
-; X86-NEXT: sete %al
+; X86-NEXT: andl 4(%ebx), %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: andl (%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $32, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: movl (%ebx,%eax), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setae %al
+; X86-NEXT: movl %esi, 4(%ebx)
+; X86-NEXT: movl %edx, (%ebx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i64:
@@ -516,101 +445,25 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $48, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, (%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %esi
-; X86-NEXT: movl 24(%esp,%esi), %edi
-; X86-NEXT: movl 28(%esp,%esi), %eax
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl 16(%esp,%esi), %edx
-; X86-NEXT: movl 20(%esp,%esi), %esi
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: andl 8(%ebx), %edi
-; X86-NEXT: andl (%ebx), %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: andl 12(%ebx), %eax
-; X86-NEXT: andl 4(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $96, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
-; SSE-LABEL: test_ne_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %eax
-; SSE-NEXT: xorl %edx, %edx
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: cmovneq %rsi, %rax
-; SSE-NEXT: andq 8(%rdi), %rdx
-; SSE-NEXT: andq (%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: setne %al
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: test_ne_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movl $1, %edx
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %rdx, %rsi
-; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rdx, %rsi
-; AVX2-NEXT: cmovneq %rax, %rdx
-; AVX2-NEXT: andq 8(%rdi), %rsi
-; AVX2-NEXT: andq (%rdi), %rdx
-; AVX2-NEXT: orq %rsi, %rdx
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_ne_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: xorl %edx, %edx
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shlxq %rcx, %rax, %rax
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rax, %rdx
-; AVX512-NEXT: cmovneq %rsi, %rax
-; AVX512-NEXT: andq 8(%rdi), %rdx
-; AVX512-NEXT: andq (%rdi), %rax
-; AVX512-NEXT: orq %rdx, %rax
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: retq
+; X64-LABEL: test_ne_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andl $96, %eax
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: movl (%rdi,%rax), %eax
+; X64-NEXT: btl %esi, %eax
+; X64-NEXT: setb %al
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -623,124 +476,33 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 56(%esp,%eax), %esi
-; X86-NEXT: movl 60(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esp,%eax), %edi
-; X86-NEXT: movl 52(%esp,%eax), %ebx
-; X86-NEXT: shldl %cl, %ebx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl 8(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: andl %edi, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 12(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %esi, 12(%eax)
-; X86-NEXT: movl %edi, (%eax)
-; X86-NEXT: movl %ebx, 4(%eax)
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btcl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: complement_ne_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %edx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rdx, %rsi
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r8
-; SSE-NEXT: andq %rsi, %r8
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: andq %rdx, %r9
-; SSE-NEXT: xorq %rcx, %rsi
-; SSE-NEXT: xorq %rax, %rdx
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: setne %al
-; SSE-NEXT: movq %rdx, (%rdi)
-; SSE-NEXT: movq %rsi, 8(%rdi)
-; SSE-NEXT: retq
-;
-; AVX-LABEL: complement_ne_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movl $1, %edx
-; AVX-NEXT: xorl %esi, %esi
-; AVX-NEXT: shldq %cl, %rdx, %rsi
-; AVX-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX-NEXT: testb $64, %cl
-; AVX-NEXT: cmovneq %rdx, %rsi
-; AVX-NEXT: cmovneq %rax, %rdx
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq 8(%rdi), %rcx
-; AVX-NEXT: movq %rcx, %r8
-; AVX-NEXT: andq %rsi, %r8
-; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: andq %rdx, %r9
-; AVX-NEXT: xorq %rcx, %rsi
-; AVX-NEXT: xorq %rax, %rdx
-; AVX-NEXT: orq %r8, %r9
-; AVX-NEXT: setne %al
-; AVX-NEXT: movq %rdx, (%rdi)
-; AVX-NEXT: movq %rsi, 8(%rdi)
-; AVX-NEXT: retq
+; X64-LABEL: complement_ne_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btcl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -755,124 +517,33 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 56(%esp,%eax), %edx
-; X86-NEXT: movl 60(%esp,%eax), %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esp,%eax), %esi
-; X86-NEXT: movl 52(%esp,%eax), %edi
-; X86-NEXT: shldl %cl, %edi, %edx
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl 8(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl (%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: movl 4(%ebx), %ebx
-; X86-NEXT: andl %ebx, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: notl %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl %ebx, %ecx
-; X86-NEXT: notl %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl 8(%ebp), %edi
-; X86-NEXT: movl %edx, 8(%edi)
-; X86-NEXT: movl %eax, 12(%edi)
-; X86-NEXT: movl %esi, (%edi)
-; X86-NEXT: movl %ecx, 4(%edi)
-; X86-NEXT: sete %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: reset_eq_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %edx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rdx, %rsi
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r8
-; SSE-NEXT: andq %rsi, %r8
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: andq %rdx, %r9
-; SSE-NEXT: notq %rdx
-; SSE-NEXT: andq %rcx, %rsi
-; SSE-NEXT: andq %rax, %rdx
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: sete %al
-; SSE-NEXT: movq %rdx, (%rdi)
-; SSE-NEXT: movq %rsi, 8(%rdi)
-; SSE-NEXT: retq
-;
-; AVX-LABEL: reset_eq_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movl $1, %edx
-; AVX-NEXT: xorl %esi, %esi
-; AVX-NEXT: shldq %cl, %rdx, %rsi
-; AVX-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX-NEXT: testb $64, %cl
-; AVX-NEXT: cmovneq %rdx, %rsi
-; AVX-NEXT: cmovneq %rax, %rdx
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq 8(%rdi), %rcx
-; AVX-NEXT: andnq %rcx, %rsi, %r8
-; AVX-NEXT: andq %rsi, %rcx
-; AVX-NEXT: andnq %rax, %rdx, %rsi
-; AVX-NEXT: andq %rdx, %rax
-; AVX-NEXT: orq %rcx, %rax
-; AVX-NEXT: sete %al
-; AVX-NEXT: movq %rsi, (%rdi)
-; AVX-NEXT: movq %r8, 8(%rdi)
-; AVX-NEXT: retq
+; X64-LABEL: reset_eq_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setae %al
+; X64-NEXT: btrl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -888,124 +559,33 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $80, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 56(%esp,%eax), %esi
-; X86-NEXT: movl 60(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esp,%eax), %edi
-; X86-NEXT: movl 52(%esp,%eax), %ebx
-; X86-NEXT: shldl %cl, %ebx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl 8(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: andl %edi, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 12(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %esi, 12(%eax)
-; X86-NEXT: movl %edi, (%eax)
-; X86-NEXT: movl %ebx, 4(%eax)
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: andl $96, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btsl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: set_ne_i128:
-; SSE: # %bb.0:
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl $1, %edx
-; SSE-NEXT: xorl %esi, %esi
-; SSE-NEXT: shldq %cl, %rdx, %rsi
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: xorl %eax, %eax
-; SSE-NEXT: testb $64, %cl
-; SSE-NEXT: cmovneq %rdx, %rsi
-; SSE-NEXT: cmovneq %rax, %rdx
-; SSE-NEXT: movq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r8
-; SSE-NEXT: andq %rsi, %r8
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: andq %rdx, %r9
-; SSE-NEXT: orq %rcx, %rsi
-; SSE-NEXT: orq %rax, %rdx
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: setne %al
-; SSE-NEXT: movq %rdx, (%rdi)
-; SSE-NEXT: movq %rsi, 8(%rdi)
-; SSE-NEXT: retq
-;
-; AVX-LABEL: set_ne_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: xorl %eax, %eax
-; AVX-NEXT: movl $1, %edx
-; AVX-NEXT: xorl %esi, %esi
-; AVX-NEXT: shldq %cl, %rdx, %rsi
-; AVX-NEXT: shlxq %rcx, %rdx, %rdx
-; AVX-NEXT: testb $64, %cl
-; AVX-NEXT: cmovneq %rdx, %rsi
-; AVX-NEXT: cmovneq %rax, %rdx
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq 8(%rdi), %rcx
-; AVX-NEXT: movq %rcx, %r8
-; AVX-NEXT: andq %rsi, %r8
-; AVX-NEXT: movq %rax, %r9
-; AVX-NEXT: andq %rdx, %r9
-; AVX-NEXT: orq %rcx, %rsi
-; AVX-NEXT: orq %rax, %rdx
-; AVX-NEXT: orq %r8, %r9
-; AVX-NEXT: setne %al
-; AVX-NEXT: movq %rdx, (%rdi)
-; AVX-NEXT: movq %rsi, 8(%rdi)
-; AVX-NEXT: retq
+; X64-LABEL: set_ne_i128:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: andl $96, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btsl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -1026,9 +606,9 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $128, %esp
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: movzbl 16(%ebp), %eax
+; X86-NEXT: subl $96, %esp
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movzbl 16(%ebp), %ebx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1037,25 +617,29 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shrb $3, %dl
-; X86-NEXT: andb $12, %dl
-; X86-NEXT: negb %dl
-; X86-NEXT: movsbl %dl, %esi
-; X86-NEXT: movl 64(%esp,%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 68(%esp,%esi), %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %edi
+; X86-NEXT: movl 72(%esp,%edi), %edx
+; X86-NEXT: movl 76(%esp,%edi), %esi
+; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%esp,%edi), %ebx
+; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; X86-NEXT: movl 68(%esp,%edi), %ebx
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: shldl %cl, %ebx, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%esp,%esi), %ebx
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: notl %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 76(%esp,%esi), %edi
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shll %cl, %edx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1063,72 +647,53 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl 40(%esp,%eax), %edi
+; X86-NEXT: movl 44(%esp,%eax), %esi
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %edi, %esi
; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl 12(%ecx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl 4(%ecx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 100(%esp,%ecx), %edi
-; X86-NEXT: movl 104(%esp,%ecx), %ecx
-; X86-NEXT: movl %ecx, %ebx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movzbl 12(%ebp), %ecx
-; X86-NEXT: shldl %cl, %edi, %ebx
-; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: notl %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl 108(%esp,%ebx), %ebx
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 12(%ecx), %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: notl %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl 96(%esp,%ebx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl 36(%esp,%esi), %esi
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: andl 8(%edx), %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl 32(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl 8(%ebp), %edi
+; X86-NEXT: andl 4(%edi), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl (%esp), %edx # 4-byte Reload
; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 8(%ecx)
-; X86-NEXT: movl %esi, 12(%ecx)
-; X86-NEXT: movl %eax, (%ecx)
-; X86-NEXT: movl %edx, 4(%ecx)
-; X86-NEXT: sete %al
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: andl (%edi), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $96, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: movl (%edi,%eax), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%edi)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%edi)
+; X86-NEXT: movl %ebx, 4(%edi)
+; X86-NEXT: movl %edx, (%edi)
+; X86-NEXT: setae %al
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -1151,86 +716,84 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; SSE-NEXT: testb $64, %cl
; SSE-NEXT: cmovneq %rsi, %r8
; SSE-NEXT: cmovneq %r9, %rsi
+; SSE-NEXT: notq %r8
; SSE-NEXT: cmovneq %rax, %rdx
; SSE-NEXT: cmovneq %r9, %rax
-; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq 8(%rdi), %r9
-; SSE-NEXT: movq %r9, %r10
-; SSE-NEXT: andq %r8, %r10
-; SSE-NEXT: notq %r8
-; SSE-NEXT: movq %rcx, %r11
-; SSE-NEXT: andq %rsi, %r11
; SSE-NEXT: notq %rsi
-; SSE-NEXT: andq %r9, %r8
+; SSE-NEXT: andq 8(%rdi), %r8
; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq %rcx, %rsi
+; SSE-NEXT: andq (%rdi), %rsi
; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: orq %r10, %r11
-; SSE-NEXT: sete %al
-; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: andl $96, %eax
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: movl (%rdi,%rax), %eax
+; SSE-NEXT: btl %ecx, %eax
+; SSE-NEXT: setae %al
; SSE-NEXT: movq %r8, 8(%rdi)
+; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: retq
;
; AVX2-LABEL: init_eq_i128:
; AVX2: # %bb.0:
; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl $1, %esi
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: shldq %cl, %rsi, %rax
-; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: movl $1, %eax
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: shldq %cl, %rax, %rsi
; AVX2-NEXT: movl %edx, %edx
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: shldq %cl, %rdx, %r8
; AVX2-NEXT: xorl %r9d, %r9d
-; AVX2-NEXT: shldq %cl, %rdx, %r9
-; AVX2-NEXT: shlxq %rcx, %rsi, %rsi
+; AVX2-NEXT: shlxq %rcx, %rax, %rax
; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %rsi, %rax
-; AVX2-NEXT: cmovneq %r8, %rsi
-; AVX2-NEXT: shlxq %rcx, %rdx, %rcx
-; AVX2-NEXT: cmovneq %rcx, %r9
-; AVX2-NEXT: cmovneq %r8, %rcx
-; AVX2-NEXT: movq (%rdi), %rdx
-; AVX2-NEXT: movq 8(%rdi), %r8
-; AVX2-NEXT: andnq %r8, %rax, %r10
-; AVX2-NEXT: andq %rax, %r8
-; AVX2-NEXT: andnq %rdx, %rsi, %r11
-; AVX2-NEXT: andq %rsi, %rdx
-; AVX2-NEXT: orq %r9, %r10
-; AVX2-NEXT: orq %rcx, %r11
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: movq %r11, (%rdi)
-; AVX2-NEXT: movq %r10, 8(%rdi)
+; AVX2-NEXT: cmovneq %rax, %rsi
+; AVX2-NEXT: cmovneq %r9, %rax
+; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX2-NEXT: cmovneq %rdx, %r8
+; AVX2-NEXT: cmovneq %r9, %rdx
+; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi
+; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: andnq (%rdi), %rax, %r8
+; AVX2-NEXT: orq %rdx, %r8
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $96, %eax
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: movl (%rdi,%rax), %eax
+; AVX2-NEXT: btl %ecx, %eax
+; AVX2-NEXT: setae %al
+; AVX2-NEXT: movq %rsi, 8(%rdi)
+; AVX2-NEXT: movq %r8, (%rdi)
; AVX2-NEXT: retq
;
; AVX512-LABEL: init_eq_i128:
; AVX512: # %bb.0:
; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movl $1, %esi
+; AVX512-NEXT: movl $1, %eax
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: shldq %cl, %rax, %rsi
; AVX512-NEXT: xorl %r8d, %r8d
-; AVX512-NEXT: shldq %cl, %rsi, %r8
-; AVX512-NEXT: shlxq %rcx, %rsi, %rsi
+; AVX512-NEXT: shlxq %rcx, %rax, %rax
; AVX512-NEXT: movl %edx, %edx
; AVX512-NEXT: xorl %r9d, %r9d
; AVX512-NEXT: shldq %cl, %rdx, %r9
; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %rsi, %r8
; AVX512-NEXT: cmovneq %rax, %rsi
-; AVX512-NEXT: shlxq %rcx, %rdx, %rcx
-; AVX512-NEXT: cmovneq %rcx, %r9
-; AVX512-NEXT: cmovneq %rax, %rcx
-; AVX512-NEXT: movq (%rdi), %rax
-; AVX512-NEXT: movq 8(%rdi), %rdx
-; AVX512-NEXT: andnq %rdx, %r8, %r10
-; AVX512-NEXT: andq %r8, %rdx
-; AVX512-NEXT: andnq %rax, %rsi, %r8
-; AVX512-NEXT: andq %rsi, %rax
-; AVX512-NEXT: orq %r9, %r10
-; AVX512-NEXT: orq %rcx, %r8
-; AVX512-NEXT: orq %rdx, %rax
-; AVX512-NEXT: sete %al
+; AVX512-NEXT: cmovneq %r8, %rax
+; AVX512-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX512-NEXT: cmovneq %rdx, %r9
+; AVX512-NEXT: cmovneq %r8, %rdx
+; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi
+; AVX512-NEXT: orq %r9, %rsi
+; AVX512-NEXT: andnq (%rdi), %rax, %r8
+; AVX512-NEXT: orq %rdx, %r8
+; AVX512-NEXT: movl %ecx, %eax
+; AVX512-NEXT: andl $96, %eax
+; AVX512-NEXT: shrl $3, %eax
+; AVX512-NEXT: movl (%rdi,%rax), %eax
+; AVX512-NEXT: btl %ecx, %eax
+; AVX512-NEXT: setae %al
+; AVX512-NEXT: movq %rsi, 8(%rdi)
; AVX512-NEXT: movq %r8, (%rdi)
-; AVX512-NEXT: movq %r10, 8(%rdi)
; AVX512-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
@@ -1252,344 +815,25 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $224, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edx), %eax
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edx), %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 52(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 4(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl 40(%ebx), %eax
-; X86-NEXT: andl 8(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 56(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 24(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: andl 44(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 12(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 60(%edi), %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 28(%edi), %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%edx), %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: negl %edx
-; X86-NEXT: movl 192(%esp,%edx), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl 32(%ebx), %ecx
-; X86-NEXT: andl (%ebx), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: andl 16(%ebx), %edi
-; X86-NEXT: andl 48(%ebx), %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 36(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 4(%ebx), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 20(%ebx), %ecx
-; X86-NEXT: andl 52(%ebx), %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $60, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
; X86-NEXT: retl
;
-; SSE-LABEL: test_ne_i512:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rbx
-; SSE-NEXT: movq -48(%rsp,%rbx), %rdx
-; SSE-NEXT: movq -40(%rsp,%rbx), %r14
-; SSE-NEXT: movq %r14, %rax
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq -16(%rsp,%rbx), %r11
-; SSE-NEXT: movq -8(%rsp,%rbx), %r10
-; SSE-NEXT: shldq %cl, %r11, %r10
-; SSE-NEXT: movq -32(%rsp,%rbx), %r9
-; SSE-NEXT: movq -24(%rsp,%rbx), %r15
-; SSE-NEXT: movq %r15, %r8
-; SSE-NEXT: shldq %cl, %r9, %r8
-; SSE-NEXT: movq -56(%rsp,%rbx), %rsi
-; SSE-NEXT: shldq %cl, %rsi, %rdx
-; SSE-NEXT: shldq %cl, %r15, %r11
-; SSE-NEXT: shldq %cl, %r14, %r9
-; SSE-NEXT: movq -64(%rsp,%rbx), %rbx
-; SSE-NEXT: shldq %cl, %rbx, %rsi
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rbx
-; SSE-NEXT: andq 32(%rdi), %r9
-; SSE-NEXT: andq 48(%rdi), %r11
-; SSE-NEXT: andq 16(%rdi), %rdx
-; SSE-NEXT: orq %r11, %rdx
-; SSE-NEXT: andq 40(%rdi), %r8
-; SSE-NEXT: andq 56(%rdi), %r10
-; SSE-NEXT: andq 24(%rdi), %rax
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: andq (%rdi), %rbx
-; SSE-NEXT: orq %r9, %rbx
-; SSE-NEXT: orq %rdx, %rbx
-; SSE-NEXT: andq 8(%rdi), %rsi
-; SSE-NEXT: orq %r8, %rsi
-; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: orq %rbx, %rsi
-; SSE-NEXT: setne %al
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: test_ne_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rsi
-; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx
-; AVX2-NEXT: movq %rbx, %rax
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq -16(%rsp,%rsi), %r11
-; AVX2-NEXT: movq -8(%rsp,%rsi), %r10
-; AVX2-NEXT: shldq %cl, %r11, %r10
-; AVX2-NEXT: movq -32(%rsp,%rsi), %r9
-; AVX2-NEXT: movq -24(%rsp,%rsi), %r14
-; AVX2-NEXT: movq %r14, %r8
-; AVX2-NEXT: shldq %cl, %r9, %r8
-; AVX2-NEXT: movq -64(%rsp,%rsi), %r15
-; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi
-; AVX2-NEXT: shldq %cl, %rsi, %rdx
-; AVX2-NEXT: shldq %cl, %r14, %r11
-; AVX2-NEXT: shldq %cl, %rbx, %r9
-; AVX2-NEXT: shldq %cl, %r15, %rsi
-; AVX2-NEXT: shlxq %rcx, %r15, %rcx
-; AVX2-NEXT: andq 32(%rdi), %r9
-; AVX2-NEXT: andq 48(%rdi), %r11
-; AVX2-NEXT: andq 16(%rdi), %rdx
-; AVX2-NEXT: andq 40(%rdi), %r8
-; AVX2-NEXT: andq 56(%rdi), %r10
-; AVX2-NEXT: andq 24(%rdi), %rax
-; AVX2-NEXT: orq %r11, %rdx
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: andq (%rdi), %rcx
-; AVX2-NEXT: orq %r9, %rcx
-; AVX2-NEXT: orq %rdx, %rcx
-; AVX2-NEXT: andq 8(%rdi), %rsi
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: orq %rax, %rsi
-; AVX2-NEXT: orq %rcx, %rsi
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_ne_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx
-; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %rax
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq -16(%rsp,%rbx), %r11
-; AVX512-NEXT: movq -8(%rsp,%rbx), %r10
-; AVX512-NEXT: shldq %cl, %r11, %r10
-; AVX512-NEXT: movq -32(%rsp,%rbx), %r9
-; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
-; AVX512-NEXT: movq %r15, %r8
-; AVX512-NEXT: shldq %cl, %r9, %r8
-; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi
-; AVX512-NEXT: shldq %cl, %rsi, %rdx
-; AVX512-NEXT: shldq %cl, %r15, %r11
-; AVX512-NEXT: shldq %cl, %r14, %r9
-; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
-; AVX512-NEXT: shldq %cl, %rbx, %rsi
-; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
-; AVX512-NEXT: andq 32(%rdi), %r9
-; AVX512-NEXT: andq 48(%rdi), %r11
-; AVX512-NEXT: andq 16(%rdi), %rdx
-; AVX512-NEXT: andq 40(%rdi), %r8
-; AVX512-NEXT: andq 56(%rdi), %r10
-; AVX512-NEXT: andq 24(%rdi), %rax
-; AVX512-NEXT: orq %r11, %rdx
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: andq (%rdi), %rcx
-; AVX512-NEXT: orq %r9, %rcx
-; AVX512-NEXT: orq %rdx, %rcx
-; AVX512-NEXT: andq 8(%rdi), %rsi
-; AVX512-NEXT: orq %r8, %rsi
-; AVX512-NEXT: orq %rax, %rsi
-; AVX512-NEXT: orq %rcx, %rsi
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: test_ne_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: andl $60, %eax
+; X64-NEXT: movl (%rdi,%rax), %eax
+; X64-NEXT: btl %esi, %eax
+; X64-NEXT: setb %al
+; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -1602,572 +846,33 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $272, %esp # imm = 0x110
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edx), %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edx), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 52(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl 8(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl 56(%edx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: movl 24(%edx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%eax), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 12(%eax), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl 60(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 28(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: negl %eax
-; X86-NEXT: movl 240(%esp,%eax), %esi
-; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl 32(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %eax
-; X86-NEXT: movl (%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 16(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: movl 48(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 36(%esi), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl %esi, %edi
-; X86-NEXT: movl 52(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl %ebx, 60(%edx)
-; X86-NEXT: movl %edi, 56(%edx)
-; X86-NEXT: movl %ecx, 52(%edx)
-; X86-NEXT: movl %esi, 44(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 40(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 36(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 32(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 28(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 24(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 20(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 16(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 12(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 8(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 4(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, (%edx)
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 48(%edx)
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btcl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: complement_ne_i512:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $56, %rsp
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rbx
-; SSE-NEXT: movq (%rsp,%rbx), %rsi
-; SSE-NEXT: movq 8(%rsp,%rbx), %r14
-; SSE-NEXT: movq %r14, %rax
-; SSE-NEXT: shldq %cl, %rsi, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 32(%rsp,%rbx), %r8
-; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
-; SSE-NEXT: shldq %cl, %r8, %rbp
-; SSE-NEXT: movq 16(%rsp,%rbx), %r9
-; SSE-NEXT: movq 24(%rsp,%rbx), %r15
-; SSE-NEXT: movq %r15, %r10
-; SSE-NEXT: shldq %cl, %r9, %r10
-; SSE-NEXT: movq -8(%rsp,%rbx), %r11
-; SSE-NEXT: shldq %cl, %r11, %rsi
-; SSE-NEXT: shldq %cl, %r15, %r8
-; SSE-NEXT: shldq %cl, %r14, %r9
-; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
-; SSE-NEXT: shldq %cl, %rbx, %r11
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rbx
-; SSE-NEXT: movq 24(%rdi), %r15
-; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 56(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 16(%rdi), %r12
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %r8, %r13
-; SSE-NEXT: andq %rsi, %r12
-; SSE-NEXT: orq %r13, %r12
-; SSE-NEXT: movq %rcx, %r13
-; SSE-NEXT: andq %rbp, %r13
-; SSE-NEXT: andq %rax, %r15
-; SSE-NEXT: orq %r13, %r15
-; SSE-NEXT: movq 32(%rdi), %r14
-; SSE-NEXT: movq %r14, %rcx
-; SSE-NEXT: andq %r9, %rcx
-; SSE-NEXT: movq (%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rbx, %r13
-; SSE-NEXT: orq %rcx, %r13
-; SSE-NEXT: orq %r12, %r13
-; SSE-NEXT: movq 40(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r12
-; SSE-NEXT: andq %r10, %r12
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movq %rdx, %rax
-; SSE-NEXT: andq %r11, %rax
-; SSE-NEXT: orq %r12, %rax
-; SSE-NEXT: orq %r15, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; SSE-NEXT: xorq %rcx, %r10
-; SSE-NEXT: xorq %r14, %r9
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; SSE-NEXT: xorq %rdx, %r11
-; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: movq %r8, 48(%rdi)
-; SSE-NEXT: movq %rbp, 56(%rdi)
-; SSE-NEXT: movq %r9, 32(%rdi)
-; SSE-NEXT: movq %r10, 40(%rdi)
-; SSE-NEXT: movq %rsi, 16(%rdi)
-; SSE-NEXT: movq %r15, 24(%rdi)
-; SSE-NEXT: movq %rbx, (%rdi)
-; SSE-NEXT: movq %r11, 8(%rdi)
-; SSE-NEXT: setne %al
-; SSE-NEXT: addq $56, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: complement_ne_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $72, %rsp
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, (%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rbx
-; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX2-NEXT: movq %rbp, %rax
-; AVX2-NEXT: shldq %cl, %rsi, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX2-NEXT: shldq %cl, %r8, %r13
-; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX2-NEXT: movq %r14, %r10
-; AVX2-NEXT: shldq %cl, %r9, %r10
-; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX2-NEXT: shldq %cl, %r11, %rsi
-; AVX2-NEXT: shldq %cl, %r14, %r8
-; AVX2-NEXT: movq 16(%rdi), %r12
-; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r8, %r14
-; AVX2-NEXT: andq %rsi, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq 56(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r13, %r15
-; AVX2-NEXT: movq 24(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %rax, %r14
-; AVX2-NEXT: orq %r15, %r14
-; AVX2-NEXT: shldq %cl, %rbp, %r9
-; AVX2-NEXT: movq (%rsp,%rbx), %rdx
-; AVX2-NEXT: movq 32(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r9, %r15
-; AVX2-NEXT: shlxq %rcx, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq (%rdi), %rbx
-; AVX2-NEXT: movq %rbx, %rbp
-; AVX2-NEXT: andq %rax, %rbp
-; AVX2-NEXT: orq %r15, %rbp
-; AVX2-NEXT: orq %r12, %rbp
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %rdx, %r11
-; AVX2-NEXT: movq 40(%rdi), %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: andq %r10, %rcx
-; AVX2-NEXT: movq 8(%rdi), %r15
-; AVX2-NEXT: movq %r15, %r12
-; AVX2-NEXT: andq %r11, %r12
-; AVX2-NEXT: orq %rcx, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX2-NEXT: xorq %rax, %r10
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT: xorq %r15, %r11
-; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX2-NEXT: orq %rbp, %r12
-; AVX2-NEXT: movq %r8, 48(%rdi)
-; AVX2-NEXT: movq %r13, 56(%rdi)
-; AVX2-NEXT: movq %r9, 32(%rdi)
-; AVX2-NEXT: movq %r10, 40(%rdi)
-; AVX2-NEXT: movq %rsi, 16(%rdi)
-; AVX2-NEXT: movq %rcx, 24(%rdi)
-; AVX2-NEXT: movq %rbx, (%rdi)
-; AVX2-NEXT: movq %r11, 8(%rdi)
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: addq $72, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: complement_ne_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $72, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, (%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX512-NEXT: movq %rbp, %rax
-; AVX512-NEXT: shldq %cl, %rsi, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX512-NEXT: shldq %cl, %r8, %r13
-; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %r10
-; AVX512-NEXT: shldq %cl, %r9, %r10
-; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX512-NEXT: shldq %cl, %r11, %rsi
-; AVX512-NEXT: shldq %cl, %r14, %r8
-; AVX512-NEXT: movq 16(%rdi), %r12
-; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r8, %r14
-; AVX512-NEXT: andq %rsi, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq 56(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r13, %r15
-; AVX512-NEXT: movq 24(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %rax, %r14
-; AVX512-NEXT: orq %r15, %r14
-; AVX512-NEXT: shldq %cl, %rbp, %r9
-; AVX512-NEXT: movq (%rsp,%rbx), %rdx
-; AVX512-NEXT: movq 32(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r9, %r15
-; AVX512-NEXT: shlxq %rcx, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq (%rdi), %rbx
-; AVX512-NEXT: movq %rbx, %rbp
-; AVX512-NEXT: andq %rax, %rbp
-; AVX512-NEXT: orq %r15, %rbp
-; AVX512-NEXT: orq %r12, %rbp
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rdx, %r11
-; AVX512-NEXT: movq 40(%rdi), %rax
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andq %r10, %rcx
-; AVX512-NEXT: movq 8(%rdi), %r15
-; AVX512-NEXT: movq %r15, %r12
-; AVX512-NEXT: andq %r11, %r12
-; AVX512-NEXT: orq %rcx, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT: xorq %rax, %r10
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX512-NEXT: xorq %r15, %r11
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX512-NEXT: orq %rbp, %r12
-; AVX512-NEXT: movq %r8, 48(%rdi)
-; AVX512-NEXT: movq %r13, 56(%rdi)
-; AVX512-NEXT: movq %r9, 32(%rdi)
-; AVX512-NEXT: movq %r10, 40(%rdi)
-; AVX512-NEXT: movq %rsi, 16(%rdi)
-; AVX512-NEXT: movq %rcx, 24(%rdi)
-; AVX512-NEXT: movq %rbx, (%rdi)
-; AVX512-NEXT: movq %r11, 8(%rdi)
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: addq $72, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: complement_ne_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: andl $60, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btcl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -2182,606 +887,33 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $288, %esp # imm = 0x120
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edi
-; X86-NEXT: subl %eax, %edi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 4(%edi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edi), %eax
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: shldl %cl, %edx, %ebx
-; X86-NEXT: movl 12(%edi), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edi), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%edi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edi), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edi), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edi), %esi
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %edx
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl 52(%edi), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl 56(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl 44(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edi), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%edi), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: negl %eax
-; X86-NEXT: movl 256(%esp,%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %esi, %edi
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: movl 32(%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %edx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %ebx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: orl %ebx, %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%esi), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: andl %edi, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: movl 52(%ebx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: notl %ebx
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: notl %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: notl %ecx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl %edx, 60(%eax)
-; X86-NEXT: movl %esi, 56(%eax)
-; X86-NEXT: movl %ecx, 52(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 44(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 40(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 36(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 32(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 28(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 24(%eax)
-; X86-NEXT: movl %ebx, 20(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 16(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 8(%eax)
-; X86-NEXT: movl %edi, 4(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, 48(%eax)
-; X86-NEXT: sete %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setae %al
+; X86-NEXT: btrl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: reset_eq_i512:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $56, %rsp
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rdx
-; SSE-NEXT: movq (%rsp,%rdx), %r9
-; SSE-NEXT: movq 8(%rsp,%rdx), %r8
-; SSE-NEXT: movq %r8, %rsi
-; SSE-NEXT: shldq %cl, %r9, %rsi
-; SSE-NEXT: movq -8(%rsp,%rdx), %rax
-; SSE-NEXT: shldq %cl, %rax, %r9
-; SSE-NEXT: movq 16(%rsp,%rdx), %r14
-; SSE-NEXT: movq 24(%rsp,%rdx), %r10
-; SSE-NEXT: movq %r10, %rbx
-; SSE-NEXT: shldq %cl, %r14, %rbx
-; SSE-NEXT: shldq %cl, %r8, %r14
-; SSE-NEXT: movq 32(%rsp,%rdx), %r13
-; SSE-NEXT: movq 40(%rsp,%rdx), %r12
-; SSE-NEXT: shldq %cl, %r13, %r12
-; SSE-NEXT: shldq %cl, %r10, %r13
-; SSE-NEXT: movq -16(%rsp,%rdx), %rdx
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq %r12, %rbp
-; SSE-NEXT: movq %r9, %r15
-; SSE-NEXT: movq %rsi, %r11
-; SSE-NEXT: movq 16(%rdi), %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %r13
-; SSE-NEXT: andq %r8, %r9
-; SSE-NEXT: orq %r13, %r9
-; SSE-NEXT: movq 56(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %r12
-; SSE-NEXT: movq 24(%rdi), %r10
-; SSE-NEXT: andq %r10, %rsi
-; SSE-NEXT: orq %r12, %rsi
-; SSE-NEXT: movq %r14, %r13
-; SSE-NEXT: movq 32(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %r14
-; SSE-NEXT: movq %rdx, %r12
-; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rcx, %rdx
-; SSE-NEXT: orq %r14, %rdx
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: movq %rbx, %r14
-; SSE-NEXT: movq 40(%rdi), %rcx
-; SSE-NEXT: andq %rcx, %rbx
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: movq 8(%rdi), %r8
-; SSE-NEXT: andq %r8, %rax
-; SSE-NEXT: orq %rbx, %rax
-; SSE-NEXT: orq %rsi, %rax
-; SSE-NEXT: notq %r11
-; SSE-NEXT: andq %r10, %r11
-; SSE-NEXT: notq %r15
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: notq %r14
-; SSE-NEXT: andq %rcx, %r14
-; SSE-NEXT: notq %r13
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; SSE-NEXT: notq %rbp
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: notq %rcx
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; SSE-NEXT: notq %r9
-; SSE-NEXT: andq %r8, %r9
-; SSE-NEXT: notq %r12
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rcx, 48(%rdi)
-; SSE-NEXT: movq %rbp, 56(%rdi)
-; SSE-NEXT: movq %r13, 32(%rdi)
-; SSE-NEXT: movq %r14, 40(%rdi)
-; SSE-NEXT: movq %r15, 16(%rdi)
-; SSE-NEXT: movq %r11, 24(%rdi)
-; SSE-NEXT: movq %r12, (%rdi)
-; SSE-NEXT: movq %r9, 8(%rdi)
-; SSE-NEXT: sete %al
-; SSE-NEXT: addq $56, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: reset_eq_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: pushq %rax
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rdx
-; AVX2-NEXT: movq -48(%rsp,%rdx), %r8
-; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx
-; AVX2-NEXT: movq %rbx, %rax
-; AVX2-NEXT: shldq %cl, %r8, %rax
-; AVX2-NEXT: movq -16(%rsp,%rdx), %r10
-; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi
-; AVX2-NEXT: shldq %cl, %r10, %rsi
-; AVX2-NEXT: movq -32(%rsp,%rdx), %r11
-; AVX2-NEXT: movq -24(%rsp,%rdx), %r14
-; AVX2-NEXT: movq %r14, %r9
-; AVX2-NEXT: shldq %cl, %r11, %r9
-; AVX2-NEXT: movq -64(%rsp,%rdx), %r15
-; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx
-; AVX2-NEXT: shldq %cl, %rdx, %r8
-; AVX2-NEXT: shldq %cl, %r14, %r10
-; AVX2-NEXT: shldq %cl, %rbx, %r11
-; AVX2-NEXT: shldq %cl, %r15, %rdx
-; AVX2-NEXT: shlxq %rcx, %r15, %rcx
-; AVX2-NEXT: movq 24(%rdi), %rbx
-; AVX2-NEXT: movq 56(%rdi), %r14
-; AVX2-NEXT: movq 16(%rdi), %r15
-; AVX2-NEXT: movq 48(%rdi), %r13
-; AVX2-NEXT: movq 32(%rdi), %rbp
-; AVX2-NEXT: andnq %rbp, %r11, %r12
-; AVX2-NEXT: andq %r11, %rbp
-; AVX2-NEXT: andnq %r13, %r10, %r11
-; AVX2-NEXT: andq %r10, %r13
-; AVX2-NEXT: andnq %r15, %r8, %r10
-; AVX2-NEXT: andq %r8, %r15
-; AVX2-NEXT: movq 40(%rdi), %r8
-; AVX2-NEXT: orq %r13, %r15
-; AVX2-NEXT: andnq %r8, %r9, %r13
-; AVX2-NEXT: andq %r9, %r8
-; AVX2-NEXT: andnq %r14, %rsi, %r9
-; AVX2-NEXT: andq %rsi, %r14
-; AVX2-NEXT: andnq %rbx, %rax, %rsi
-; AVX2-NEXT: andq %rax, %rbx
-; AVX2-NEXT: movq (%rdi), %rax
-; AVX2-NEXT: orq %r14, %rbx
-; AVX2-NEXT: andnq %rax, %rcx, %r14
-; AVX2-NEXT: andq %rcx, %rax
-; AVX2-NEXT: orq %rbp, %rax
-; AVX2-NEXT: movq 8(%rdi), %rcx
-; AVX2-NEXT: orq %r15, %rax
-; AVX2-NEXT: andnq %rcx, %rdx, %r15
-; AVX2-NEXT: andq %rdx, %rcx
-; AVX2-NEXT: orq %r8, %rcx
-; AVX2-NEXT: orq %rbx, %rcx
-; AVX2-NEXT: orq %rax, %rcx
-; AVX2-NEXT: movq %r11, 48(%rdi)
-; AVX2-NEXT: movq %r9, 56(%rdi)
-; AVX2-NEXT: movq %r12, 32(%rdi)
-; AVX2-NEXT: movq %r13, 40(%rdi)
-; AVX2-NEXT: movq %r10, 16(%rdi)
-; AVX2-NEXT: movq %rsi, 24(%rdi)
-; AVX2-NEXT: movq %r14, (%rdi)
-; AVX2-NEXT: movq %r15, 8(%rdi)
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: addq $8, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: reset_eq_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: pushq %rax
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq -48(%rsp,%rbx), %r8
-; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %rax
-; AVX512-NEXT: shldq %cl, %r8, %rax
-; AVX512-NEXT: movq -16(%rsp,%rbx), %r10
-; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi
-; AVX512-NEXT: shldq %cl, %r10, %rsi
-; AVX512-NEXT: movq -32(%rsp,%rbx), %r11
-; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
-; AVX512-NEXT: movq %r15, %r9
-; AVX512-NEXT: shldq %cl, %r11, %r9
-; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx
-; AVX512-NEXT: shldq %cl, %rdx, %r8
-; AVX512-NEXT: shldq %cl, %r15, %r10
-; AVX512-NEXT: shldq %cl, %r14, %r11
-; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
-; AVX512-NEXT: shldq %cl, %rbx, %rdx
-; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
-; AVX512-NEXT: movq 24(%rdi), %rbx
-; AVX512-NEXT: movq 56(%rdi), %r14
-; AVX512-NEXT: movq 16(%rdi), %r15
-; AVX512-NEXT: movq 48(%rdi), %r13
-; AVX512-NEXT: movq 32(%rdi), %rbp
-; AVX512-NEXT: andnq %rbp, %r11, %r12
-; AVX512-NEXT: andq %r11, %rbp
-; AVX512-NEXT: andnq %r13, %r10, %r11
-; AVX512-NEXT: andq %r10, %r13
-; AVX512-NEXT: andnq %r15, %r8, %r10
-; AVX512-NEXT: andq %r8, %r15
-; AVX512-NEXT: movq 40(%rdi), %r8
-; AVX512-NEXT: orq %r13, %r15
-; AVX512-NEXT: andnq %r8, %r9, %r13
-; AVX512-NEXT: andq %r9, %r8
-; AVX512-NEXT: andnq %r14, %rsi, %r9
-; AVX512-NEXT: andq %rsi, %r14
-; AVX512-NEXT: andnq %rbx, %rax, %rsi
-; AVX512-NEXT: andq %rax, %rbx
-; AVX512-NEXT: movq (%rdi), %rax
-; AVX512-NEXT: orq %r14, %rbx
-; AVX512-NEXT: andnq %rax, %rcx, %r14
-; AVX512-NEXT: andq %rcx, %rax
-; AVX512-NEXT: orq %rbp, %rax
-; AVX512-NEXT: movq 8(%rdi), %rcx
-; AVX512-NEXT: orq %r15, %rax
-; AVX512-NEXT: andnq %rcx, %rdx, %r15
-; AVX512-NEXT: andq %rdx, %rcx
-; AVX512-NEXT: orq %r8, %rcx
-; AVX512-NEXT: orq %rbx, %rcx
-; AVX512-NEXT: orq %rax, %rcx
-; AVX512-NEXT: movq %r11, 48(%rdi)
-; AVX512-NEXT: movq %r9, 56(%rdi)
-; AVX512-NEXT: movq %r12, 32(%rdi)
-; AVX512-NEXT: movq %r13, 40(%rdi)
-; AVX512-NEXT: movq %r10, 16(%rdi)
-; AVX512-NEXT: movq %rsi, 24(%rdi)
-; AVX512-NEXT: movq %r14, (%rdi)
-; AVX512-NEXT: movq %r15, 8(%rdi)
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: addq $8, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: reset_eq_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: andl $60, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setae %al
+; X64-NEXT: btrl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -2797,572 +929,33 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i512:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $272, %esp # imm = 0x110
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: andl $60, %eax
-; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT: subl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 24(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%edx), %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%edx), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%edx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 52(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl 40(%edx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: movl 8(%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl 56(%edx), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %ebx
-; X86-NEXT: movl 24(%edx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%eax), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 12(%eax), %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: movl 60(%eax), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 28(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl (%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: negl %eax
-; X86-NEXT: movl 240(%esp,%eax), %esi
-; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %esi
-; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: movl 32(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %eax
-; X86-NEXT: movl (%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl 16(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ebx, %eax
-; X86-NEXT: movl 48(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 36(%esi), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 4(%esi), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl %esi, %edi
-; X86-NEXT: movl 52(%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: movl %ebx, 60(%edx)
-; X86-NEXT: movl %edi, 56(%edx)
-; X86-NEXT: movl %ecx, 52(%edx)
-; X86-NEXT: movl %esi, 44(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 40(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 36(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 32(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 28(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 24(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 20(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 16(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 12(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 8(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 4(%edx)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, (%edx)
-; X86-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, 48(%edx)
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shrl $3, %esi
+; X86-NEXT: andl $60, %esi
+; X86-NEXT: movl (%ecx,%esi), %edi
+; X86-NEXT: btl %edx, %edi
+; X86-NEXT: setb %al
+; X86-NEXT: btsl %edx, %edi
+; X86-NEXT: movl %edi, (%ecx,%esi)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: set_ne_i512:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $56, %rsp
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %rbx
-; SSE-NEXT: movq (%rsp,%rbx), %rsi
-; SSE-NEXT: movq 8(%rsp,%rbx), %r14
-; SSE-NEXT: movq %r14, %rax
-; SSE-NEXT: shldq %cl, %rsi, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 32(%rsp,%rbx), %r8
-; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
-; SSE-NEXT: shldq %cl, %r8, %rbp
-; SSE-NEXT: movq 16(%rsp,%rbx), %r9
-; SSE-NEXT: movq 24(%rsp,%rbx), %r15
-; SSE-NEXT: movq %r15, %r10
-; SSE-NEXT: shldq %cl, %r9, %r10
-; SSE-NEXT: movq -8(%rsp,%rbx), %r11
-; SSE-NEXT: shldq %cl, %r11, %rsi
-; SSE-NEXT: shldq %cl, %r15, %r8
-; SSE-NEXT: shldq %cl, %r14, %r9
-; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
-; SSE-NEXT: shldq %cl, %rbx, %r11
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shlq %cl, %rbx
-; SSE-NEXT: movq 24(%rdi), %r15
-; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 56(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 16(%rdi), %r12
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %r8, %r13
-; SSE-NEXT: andq %rsi, %r12
-; SSE-NEXT: orq %r13, %r12
-; SSE-NEXT: movq %rcx, %r13
-; SSE-NEXT: andq %rbp, %r13
-; SSE-NEXT: andq %rax, %r15
-; SSE-NEXT: orq %r13, %r15
-; SSE-NEXT: movq 32(%rdi), %r14
-; SSE-NEXT: movq %r14, %rcx
-; SSE-NEXT: andq %r9, %rcx
-; SSE-NEXT: movq (%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rbx, %r13
-; SSE-NEXT: orq %rcx, %r13
-; SSE-NEXT: orq %r12, %r13
-; SSE-NEXT: movq 40(%rdi), %rcx
-; SSE-NEXT: movq %rcx, %r12
-; SSE-NEXT: andq %r10, %r12
-; SSE-NEXT: movq 8(%rdi), %rdx
-; SSE-NEXT: movq %rdx, %rax
-; SSE-NEXT: andq %r11, %rax
-; SSE-NEXT: orq %r12, %rax
-; SSE-NEXT: orq %r15, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; SSE-NEXT: orq %rcx, %r10
-; SSE-NEXT: orq %r14, %r9
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: movq %r8, 48(%rdi)
-; SSE-NEXT: movq %rbp, 56(%rdi)
-; SSE-NEXT: movq %r9, 32(%rdi)
-; SSE-NEXT: movq %r10, 40(%rdi)
-; SSE-NEXT: movq %rsi, 16(%rdi)
-; SSE-NEXT: movq %r15, 24(%rdi)
-; SSE-NEXT: movq %rbx, (%rdi)
-; SSE-NEXT: movq %r11, 8(%rdi)
-; SSE-NEXT: setne %al
-; SSE-NEXT: addq $56, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX2-LABEL: set_ne_i512:
-; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $72, %rsp
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, (%rsp)
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rbx
-; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX2-NEXT: movq %rbp, %rax
-; AVX2-NEXT: shldq %cl, %rsi, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX2-NEXT: shldq %cl, %r8, %r13
-; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX2-NEXT: movq %r14, %r10
-; AVX2-NEXT: shldq %cl, %r9, %r10
-; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX2-NEXT: shldq %cl, %r11, %rsi
-; AVX2-NEXT: shldq %cl, %r14, %r8
-; AVX2-NEXT: movq 16(%rdi), %r12
-; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r8, %r14
-; AVX2-NEXT: andq %rsi, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq 56(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r13, %r15
-; AVX2-NEXT: movq 24(%rdi), %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %rax, %r14
-; AVX2-NEXT: orq %r15, %r14
-; AVX2-NEXT: shldq %cl, %rbp, %r9
-; AVX2-NEXT: movq (%rsp,%rbx), %rdx
-; AVX2-NEXT: movq 32(%rdi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r9, %r15
-; AVX2-NEXT: shlxq %rcx, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq (%rdi), %rbx
-; AVX2-NEXT: movq %rbx, %rbp
-; AVX2-NEXT: andq %rax, %rbp
-; AVX2-NEXT: orq %r15, %rbp
-; AVX2-NEXT: orq %r12, %rbp
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %rdx, %r11
-; AVX2-NEXT: movq 40(%rdi), %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: andq %r10, %rcx
-; AVX2-NEXT: movq 8(%rdi), %r15
-; AVX2-NEXT: movq %r15, %r12
-; AVX2-NEXT: andq %r11, %r12
-; AVX2-NEXT: orq %rcx, %r12
-; AVX2-NEXT: orq %r14, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX2-NEXT: orq %rax, %r10
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT: orq %r15, %r11
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX2-NEXT: orq %rbp, %r12
-; AVX2-NEXT: movq %r8, 48(%rdi)
-; AVX2-NEXT: movq %r13, 56(%rdi)
-; AVX2-NEXT: movq %r9, 32(%rdi)
-; AVX2-NEXT: movq %r10, 40(%rdi)
-; AVX2-NEXT: movq %rsi, 16(%rdi)
-; AVX2-NEXT: movq %rcx, 24(%rdi)
-; AVX2-NEXT: movq %rbx, (%rdi)
-; AVX2-NEXT: movq %r11, 8(%rdi)
-; AVX2-NEXT: setne %al
-; AVX2-NEXT: addq $72, %rsp
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: set_ne_i512:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $72, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, (%rsp)
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rbx
-; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
-; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
-; AVX512-NEXT: movq %rbp, %rax
-; AVX512-NEXT: shldq %cl, %rsi, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
-; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
-; AVX512-NEXT: shldq %cl, %r8, %r13
-; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
-; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
-; AVX512-NEXT: movq %r14, %r10
-; AVX512-NEXT: shldq %cl, %r9, %r10
-; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
-; AVX512-NEXT: shldq %cl, %r11, %rsi
-; AVX512-NEXT: shldq %cl, %r14, %r8
-; AVX512-NEXT: movq 16(%rdi), %r12
-; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 48(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r8, %r14
-; AVX512-NEXT: andq %rsi, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq 56(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r13, %r15
-; AVX512-NEXT: movq 24(%rdi), %r14
-; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %rax, %r14
-; AVX512-NEXT: orq %r15, %r14
-; AVX512-NEXT: shldq %cl, %rbp, %r9
-; AVX512-NEXT: movq (%rsp,%rbx), %rdx
-; AVX512-NEXT: movq 32(%rdi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r9, %r15
-; AVX512-NEXT: shlxq %rcx, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq (%rdi), %rbx
-; AVX512-NEXT: movq %rbx, %rbp
-; AVX512-NEXT: andq %rax, %rbp
-; AVX512-NEXT: orq %r15, %rbp
-; AVX512-NEXT: orq %r12, %rbp
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rdx, %r11
-; AVX512-NEXT: movq 40(%rdi), %rax
-; AVX512-NEXT: movq %rax, %rcx
-; AVX512-NEXT: andq %r10, %rcx
-; AVX512-NEXT: movq 8(%rdi), %r15
-; AVX512-NEXT: movq %r15, %r12
-; AVX512-NEXT: andq %r11, %r12
-; AVX512-NEXT: orq %rcx, %r12
-; AVX512-NEXT: orq %r14, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT: orq %rax, %r10
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX512-NEXT: orq %r15, %r11
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX512-NEXT: orq %rbp, %r12
-; AVX512-NEXT: movq %r8, 48(%rdi)
-; AVX512-NEXT: movq %r13, 56(%rdi)
-; AVX512-NEXT: movq %r9, 32(%rdi)
-; AVX512-NEXT: movq %r10, 40(%rdi)
-; AVX512-NEXT: movq %rsi, 16(%rdi)
-; AVX512-NEXT: movq %rcx, 24(%rdi)
-; AVX512-NEXT: movq %rbx, (%rdi)
-; AVX512-NEXT: movq %r11, 8(%rdi)
-; AVX512-NEXT: setne %al
-; AVX512-NEXT: addq $72, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; X64-LABEL: set_ne_i512:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: andl $60, %ecx
+; X64-NEXT: movl (%rdi,%rcx), %edx
+; X64-NEXT: btl %esi, %edx
+; X64-NEXT: setb %al
+; X64-NEXT: btsl %esi, %edx
+; X64-NEXT: movl %edx, (%rdi,%rcx)
+; X64-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -3383,13 +976,14 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $432, %esp # imm = 0x1B0
+; X86-NEXT: subl $352, %esp # imm = 0x160
; X86-NEXT: movl 12(%ebp), %ecx
; X86-NEXT: movl %ecx, %edx
; X86-NEXT: shrl $3, %edx
; X86-NEXT: andl $60, %edx
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: subl %edx, %eax
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -3422,60 +1016,58 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 56(%esi), %eax
+; X86-NEXT: movl 56(%eax), %esi
+; X86-NEXT: movl 60(%eax), %ebx
+; X86-NEXT: movl 52(%eax), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%eax), %edi
+; X86-NEXT: movl 44(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%eax), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%esi), %eax
+; X86-NEXT: movzbl 16(%ebp), %eax
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%esi), %eax
-; X86-NEXT: movl 48(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%esi), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%esi), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl 16(%ebp), %ebx
-; X86-NEXT: movzbl %bl, %esi
-; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: subl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -3500,9 +1092,12 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -3534,273 +1129,148 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edx
+; X86-NEXT: movl 56(%eax), %esi
+; X86-NEXT: movl 60(%eax), %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: andl 60(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 52(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 56(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 48(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 52(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 44(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 48(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 40(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 44(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 36(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 40(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 32(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 36(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 28(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 32(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 24(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 28(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 20(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 24(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 16(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 20(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 12(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 16(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 8(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 12(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 4(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 8(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: notl %esi
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: andl 4(%edx), %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: notl %esi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: andl (%edx), %esi
+; X86-NEXT: orl %eax, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%edx,%eax), %eax
+; X86-NEXT: btl %ecx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl %eax, 60(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, 56(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%ebx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %eax, %edx
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %eax, 52(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%ebx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %esi, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl (%ebx), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edi, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, 48(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%ebx), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %ecx, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %edx, %eax
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 4(%ebx), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl %edx, %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl 56(%edi), %ebx
-; X86-NEXT: movl 60(%edi), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 52(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 48(%edi), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, 44(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: notl %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl 40(%edi), %ebx
-; X86-NEXT: movl 44(%edi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 36(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 32(%edi), %ebx
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 28(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 24(%edi), %ebx
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 20(%edi), %eax
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 16(%edi), %ebx
-; X86-NEXT: shldl %cl, %ebx, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl 12(%edi), %eax
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: notl %esi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 8(%edi), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, 40(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: notl %eax
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 4(%edi), %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl %eax, 36(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl %cl, %esi, %eax
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: notl %edx
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl (%edi), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, %ebx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: notl %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %eax, 32(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%edx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 60(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 56(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 52(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 44(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 40(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 36(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 32(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 28(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 24(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 20(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 16(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 12(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %edx, 4(%eax)
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %esi, 48(%eax)
-; X86-NEXT: sete %al
+; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%edx)
+; X86-NEXT: movl %ebx, 8(%edx)
+; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %esi, (%edx)
+; X86-NEXT: setae %al
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -3816,7 +1286,8 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; SSE-NEXT: pushq %r13
; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $216, %rsp
+; SSE-NEXT: subq $184, %rsp
+; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
@@ -3829,139 +1300,103 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: negl %esi
-; SSE-NEXT: movslq %esi, %r10
-; SSE-NEXT: movq 184(%rsp,%r10), %r11
-; SSE-NEXT: movq 192(%rsp,%r10), %rsi
-; SSE-NEXT: movq %rsi, %r13
-; SSE-NEXT: shldq %cl, %r11, %r13
-; SSE-NEXT: movq 200(%rsp,%r10), %r15
-; SSE-NEXT: shldq %cl, %rsi, %r15
-; SSE-NEXT: movq 168(%rsp,%r10), %rbx
-; SSE-NEXT: movq 176(%rsp,%r10), %rsi
-; SSE-NEXT: movq %rsi, %r14
-; SSE-NEXT: shldq %cl, %rbx, %r14
-; SSE-NEXT: shldq %cl, %rsi, %r11
-; SSE-NEXT: movq 152(%rsp,%r10), %rax
-; SSE-NEXT: movq 160(%rsp,%r10), %r8
-; SSE-NEXT: movq %r8, %r12
-; SSE-NEXT: shldq %cl, %rax, %r12
-; SSE-NEXT: shldq %cl, %r8, %rbx
-; SSE-NEXT: movq 144(%rsp,%r10), %r9
-; SSE-NEXT: movq %r9, %r8
-; SSE-NEXT: shlq %cl, %r8
-; SSE-NEXT: shldq %cl, %r9, %rax
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movl %edx, %edx
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %eax
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: andl $56, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: movslq %eax, %r12
+; SSE-NEXT: movq 160(%rsp,%r12), %rax
+; SSE-NEXT: movq 168(%rsp,%r12), %r10
+; SSE-NEXT: shldq %cl, %rax, %r10
+; SSE-NEXT: movq 152(%rsp,%r12), %rsi
+; SSE-NEXT: shldq %cl, %rsi, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 144(%rsp,%r12), %r11
+; SSE-NEXT: shldq %cl, %r11, %rsi
+; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 136(%rsp,%r12), %rbx
+; SSE-NEXT: shldq %cl, %rbx, %r11
+; SSE-NEXT: movq 128(%rsp,%r12), %r14
+; SSE-NEXT: shldq %cl, %r14, %rbx
+; SSE-NEXT: movq 120(%rsp,%r12), %r15
+; SSE-NEXT: shldq %cl, %r15, %r14
+; SSE-NEXT: movq 112(%rsp,%r12), %r13
+; SSE-NEXT: shldq %cl, %r13, %r15
+; SSE-NEXT: shlq %cl, %r13
+; SSE-NEXT: movl %edx, %eax
; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq 16(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %rsi
-; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rsi, %r13
-; SSE-NEXT: andq %rdx, %r12
-; SSE-NEXT: orq %r13, %r12
-; SSE-NEXT: movq %r15, %rsi
-; SSE-NEXT: movq 56(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %r15
-; SSE-NEXT: movq %rbx, %r13
-; SSE-NEXT: movq 24(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %rbx
-; SSE-NEXT: orq %r15, %rbx
-; SSE-NEXT: movq %r14, %rbp
-; SSE-NEXT: movq 32(%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %r14
-; SSE-NEXT: movq %r8, %r15
-; SSE-NEXT: movq (%rdi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %r8
-; SSE-NEXT: orq %r14, %r8
-; SSE-NEXT: orq %r12, %r8
-; SSE-NEXT: movq %r11, %r12
-; SSE-NEXT: movq 40(%rdi), %r9
-; SSE-NEXT: andq %r9, %r11
-; SSE-NEXT: movq %rax, %r14
-; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq 32(%rsp,%r12), %rax
+; SSE-NEXT: movq 40(%rsp,%r12), %rdx
+; SSE-NEXT: shldq %cl, %rax, %rdx
; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq %rdx, %rax
-; SSE-NEXT: orq %r11, %rax
-; SSE-NEXT: orq %rbx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 24(%rsp,%r12), %rdx
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq 16(%rsp,%r12), %rsi
+; SSE-NEXT: shldq %cl, %rsi, %rdx
+; SSE-NEXT: movq 8(%rsp,%r12), %r8
+; SSE-NEXT: shldq %cl, %r8, %rsi
+; SSE-NEXT: movq (%rsp,%r12), %rbp
+; SSE-NEXT: shldq %cl, %rbp, %r8
+; SSE-NEXT: movq -8(%rsp,%r12), %r9
+; SSE-NEXT: shldq %cl, %r9, %rbp
+; SSE-NEXT: notq %r10
+; SSE-NEXT: andq 56(%rdi), %r10
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
+; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE-NEXT: notq %r10
+; SSE-NEXT: andq 48(%rdi), %r10
+; SSE-NEXT: orq %rax, %r10
; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE-NEXT: notq %rax
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: andq 40(%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq 56(%rsp,%r10), %r11
-; SSE-NEXT: movq 64(%rsp,%r10), %rax
-; SSE-NEXT: movq %rax, %rbx
-; SSE-NEXT: shldq %cl, %r11, %rbx
-; SSE-NEXT: orq %rbx, %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: notq %rsi
-; SSE-NEXT: movq 72(%rsp,%r10), %rbx
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; SSE-NEXT: orq %rbx, %rsi
-; SSE-NEXT: notq %rbp
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
-; SSE-NEXT: movq 40(%rsp,%r10), %rax
-; SSE-NEXT: movq 48(%rsp,%r10), %rdx
-; SSE-NEXT: movq %rdx, %rbx
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: orq %rbx, %rbp
-; SSE-NEXT: notq %r12
-; SSE-NEXT: andq %r9, %r12
-; SSE-NEXT: shldq %cl, %rdx, %r11
-; SSE-NEXT: movq 24(%rsp,%r10), %r9
-; SSE-NEXT: movq 32(%rsp,%r10), %rdx
-; SSE-NEXT: movq %rdx, %rbx
-; SSE-NEXT: shldq %cl, %r9, %rbx
-; SSE-NEXT: orq %r11, %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
; SSE-NEXT: notq %r11
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: orq %rbx, %r11
-; SSE-NEXT: notq %r13
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
-; SSE-NEXT: orq %rax, %r13
-; SSE-NEXT: notq %r15
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: movq 16(%rsp,%r10), %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: shlq %cl, %rdx
-; SSE-NEXT: orq %rdx, %r15
+; SSE-NEXT: andq 32(%rdi), %r11
+; SSE-NEXT: orq %rsi, %r11
+; SSE-NEXT: notq %rbx
+; SSE-NEXT: andq 24(%rdi), %rbx
+; SSE-NEXT: orq %r8, %rbx
; SSE-NEXT: notq %r14
-; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: andq 16(%rdi), %r14
+; SSE-NEXT: orq %rbp, %r14
+; SSE-NEXT: notq %r15
+; SSE-NEXT: movq -16(%rsp,%r12), %rax
; SSE-NEXT: shldq %cl, %rax, %r9
-; SSE-NEXT: orq %r9, %r14
-; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: andq 8(%rdi), %r15
+; SSE-NEXT: orq %r9, %r15
+; SSE-NEXT: notq %r13
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rax
+; SSE-NEXT: andq (%rdi), %r13
+; SSE-NEXT: orq %rax, %r13
; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: movq %rax, 48(%rdi)
-; SSE-NEXT: movq %rsi, 56(%rdi)
-; SSE-NEXT: movq %rbp, 32(%rdi)
-; SSE-NEXT: movq %r12, 40(%rdi)
-; SSE-NEXT: movq %r11, 16(%rdi)
-; SSE-NEXT: movq %r13, 24(%rdi)
-; SSE-NEXT: movq %r15, (%rdi)
-; SSE-NEXT: movq %r14, 8(%rdi)
-; SSE-NEXT: sete %al
-; SSE-NEXT: addq $216, %rsp
+; SSE-NEXT: andl $60, %eax
+; SSE-NEXT: movl (%rdi,%rax), %eax
+; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; SSE-NEXT: btl %ecx, %eax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: movq %rax, 56(%rdi)
+; SSE-NEXT: movq %r10, 48(%rdi)
+; SSE-NEXT: movq %rdx, 40(%rdi)
+; SSE-NEXT: movq %r11, 32(%rdi)
+; SSE-NEXT: movq %rbx, 24(%rdi)
+; SSE-NEXT: movq %r14, 16(%rdi)
+; SSE-NEXT: movq %r15, 8(%rdi)
+; SSE-NEXT: movq %r13, (%rdi)
+; SSE-NEXT: setae %al
+; SSE-NEXT: addq $184, %rsp
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r12
; SSE-NEXT: popq %r13
@@ -3978,132 +1413,103 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $200, %rsp
+; AVX2-NEXT: subq $168, %rsp
; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %esi, %r8d
-; AVX2-NEXT: andl $63, %r8d
-; AVX2-NEXT: shrl $3, %esi
-; AVX2-NEXT: andl $56, %esi
-; AVX2-NEXT: negl %esi
-; AVX2-NEXT: movslq %esi, %rsi
-; AVX2-NEXT: movq 144(%rsp,%rsi), %r11
-; AVX2-NEXT: movq 152(%rsp,%rsi), %r12
-; AVX2-NEXT: movq %r12, %r10
-; AVX2-NEXT: movl %r8d, %ecx
-; AVX2-NEXT: shldq %cl, %r11, %r10
-; AVX2-NEXT: movq 176(%rsp,%rsi), %r14
-; AVX2-NEXT: movq 184(%rsp,%rsi), %r9
-; AVX2-NEXT: shldq %cl, %r14, %r9
-; AVX2-NEXT: movq 160(%rsp,%rsi), %r15
-; AVX2-NEXT: movq 168(%rsp,%rsi), %r13
-; AVX2-NEXT: movq %r13, %rbx
-; AVX2-NEXT: shldq %cl, %r15, %rbx
-; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp
-; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 136(%rsp,%rsi), %rax
-; AVX2-NEXT: shldq %cl, %rax, %r11
-; AVX2-NEXT: shldq %cl, %r13, %r14
-; AVX2-NEXT: shldq %cl, %r12, %r15
-; AVX2-NEXT: shldq %cl, %rbp, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movl %edx, %edx
+; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: movl %esi, %r11d
+; AVX2-NEXT: shrl $3, %r11d
+; AVX2-NEXT: movl %r11d, %eax
+; AVX2-NEXT: andl $56, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: movslq %eax, %r10
+; AVX2-NEXT: movq 104(%rsp,%r10), %r15
+; AVX2-NEXT: movq 112(%rsp,%r10), %rax
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: shldq %cl, %r15, %rsi
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 120(%rsp,%r10), %rsi
+; AVX2-NEXT: movq %rsi, %r8
+; AVX2-NEXT: shldq %cl, %rax, %r8
+; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 128(%rsp,%r10), %rax
+; AVX2-NEXT: movq %rax, %rbx
+; AVX2-NEXT: shldq %cl, %rsi, %rbx
+; AVX2-NEXT: movq 136(%rsp,%r10), %rsi
+; AVX2-NEXT: movq %rsi, %r14
+; AVX2-NEXT: shldq %cl, %rax, %r14
+; AVX2-NEXT: movq 144(%rsp,%r10), %rax
+; AVX2-NEXT: movq %rax, %r12
+; AVX2-NEXT: shldq %cl, %rsi, %r12
+; AVX2-NEXT: movq 96(%rsp,%r10), %rsi
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 152(%rsp,%r10), %r13
+; AVX2-NEXT: shldq %cl, %rax, %r13
+; AVX2-NEXT: shldq %cl, %rsi, %r15
+; AVX2-NEXT: movl %edx, %eax
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdx, (%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq 16(%rdi), %r12
-; AVX2-NEXT: movq 48(%rdi), %rbp
-; AVX2-NEXT: movq 32(%rdi), %r13
-; AVX2-NEXT: andnq %r13, %r15, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r15, %r13
-; AVX2-NEXT: andnq %rbp, %r14, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r14, %rbp
-; AVX2-NEXT: andnq %r12, %r11, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r11, %r12
-; AVX2-NEXT: movq 40(%rdi), %rax
+; AVX2-NEXT: movq 16(%rsp,%r10), %rbp
+; AVX2-NEXT: movq 24(%rsp,%r10), %r9
+; AVX2-NEXT: shldq %cl, %rbp, %r9
+; AVX2-NEXT: movq 8(%rsp,%r10), %rdx
+; AVX2-NEXT: shldq %cl, %rdx, %rbp
+; AVX2-NEXT: movq (%rsp,%r10), %rax
+; AVX2-NEXT: shldq %cl, %rax, %rdx
+; AVX2-NEXT: movq -8(%rsp,%r10), %r8
+; AVX2-NEXT: shldq %cl, %r8, %rax
+; AVX2-NEXT: movq -16(%rsp,%r10), %rsi
+; AVX2-NEXT: shldq %cl, %rsi, %r8
+; AVX2-NEXT: andnq 56(%rdi), %r13, %r13
+; AVX2-NEXT: orq %r9, %r13
+; AVX2-NEXT: movq -24(%rsp,%r10), %r9
+; AVX2-NEXT: shldq %cl, %r9, %rsi
+; AVX2-NEXT: andnq 48(%rdi), %r12, %r12
+; AVX2-NEXT: andnq 40(%rdi), %r14, %r14
; AVX2-NEXT: orq %rbp, %r12
-; AVX2-NEXT: andnq %rax, %rbx, %rcx
-; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq %rax, %rbp
-; AVX2-NEXT: andq %rbx, %rbp
-; AVX2-NEXT: movq 56(%rdi), %rcx
-; AVX2-NEXT: andnq %rcx, %r9, %rbx
-; AVX2-NEXT: andq %r9, %rcx
-; AVX2-NEXT: movq 24(%rdi), %rax
-; AVX2-NEXT: andnq %rax, %r10, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq %r10, %rax
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: movq (%rdi), %r10
-; AVX2-NEXT: andnq %r10, %rcx, %r15
-; AVX2-NEXT: andq %rcx, %r10
-; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq 48(%rsp,%rsi), %r11
-; AVX2-NEXT: movq %r11, %r9
-; AVX2-NEXT: movl %r8d, %ecx
-; AVX2-NEXT: shldq %cl, %rdx, %r9
-; AVX2-NEXT: orq %r13, %r10
-; AVX2-NEXT: orq %r12, %r10
-; AVX2-NEXT: movq 8(%rdi), %r13
+; AVX2-NEXT: orq %rdx, %r14
+; AVX2-NEXT: andnq 32(%rdi), %rbx, %rdx
+; AVX2-NEXT: orq %rax, %rdx
+; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX2-NEXT: movq -32(%rsp,%r10), %r10
+; AVX2-NEXT: shlxq %rcx, %r10, %rbx
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %r10, %r9
; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: andnq %r13, %rcx, %r12
-; AVX2-NEXT: andq %rcx, %r13
-; AVX2-NEXT: orq %rbp, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq 56(%rsp,%rsi), %rax
-; AVX2-NEXT: movl %r8d, %ecx
-; AVX2-NEXT: shldq %cl, %r11, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: orq %r9, %r14
-; AVX2-NEXT: orq %rax, %rbx
-; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 24(%rsp,%rsi), %rax
-; AVX2-NEXT: movq 32(%rsp,%rsi), %r9
-; AVX2-NEXT: movq %r9, %r11
-; AVX2-NEXT: shldq %cl, %rax, %r11
-; AVX2-NEXT: shldq %cl, %r9, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX2-NEXT: orq %r11, %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: orq %rdx, %rbx
-; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq 16(%rsp,%rsi), %r9
-; AVX2-NEXT: movq %r9, %r11
-; AVX2-NEXT: shldq %cl, %rdx, %r11
-; AVX2-NEXT: shldq %cl, %r9, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: orq %r11, %r9
-; AVX2-NEXT: movq (%rsp,%rsi), %rsi
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: shlxq %r8, %rsi, %rax
-; AVX2-NEXT: shldq %cl, %rsi, %rdx
-; AVX2-NEXT: orq %rax, %r15
-; AVX2-NEXT: orq %rdx, %r12
-; AVX2-NEXT: orq %r10, %r13
-; AVX2-NEXT: movq %r14, 48(%rdi)
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: movq %rax, 56(%rdi)
-; AVX2-NEXT: movq %rbp, 32(%rdi)
-; AVX2-NEXT: movq %rbx, 40(%rdi)
-; AVX2-NEXT: movq %r9, 16(%rdi)
-; AVX2-NEXT: movq %r11, 24(%rdi)
-; AVX2-NEXT: movq %r15, (%rdi)
-; AVX2-NEXT: movq %r12, 8(%rdi)
-; AVX2-NEXT: sete %al
-; AVX2-NEXT: addq $200, %rsp
+; AVX2-NEXT: andnq 24(%rdi), %rcx, %rcx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andnq 16(%rdi), %r10, %r10
+; AVX2-NEXT: orq %r8, %rcx
+; AVX2-NEXT: orq %rsi, %r10
+; AVX2-NEXT: andnq 8(%rdi), %r15, %rsi
+; AVX2-NEXT: orq %r9, %rsi
+; AVX2-NEXT: andnq (%rdi), %rax, %rax
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: andl $60, %r11d
+; AVX2-NEXT: movl (%rdi,%r11), %r8d
+; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; AVX2-NEXT: btl %r9d, %r8d
+; AVX2-NEXT: movq %r13, 56(%rdi)
+; AVX2-NEXT: movq %r12, 48(%rdi)
+; AVX2-NEXT: movq %r14, 40(%rdi)
+; AVX2-NEXT: movq %rdx, 32(%rdi)
+; AVX2-NEXT: movq %rcx, 24(%rdi)
+; AVX2-NEXT: movq %r10, 16(%rdi)
+; AVX2-NEXT: movq %rsi, 8(%rdi)
+; AVX2-NEXT: movq %rax, (%rdi)
+; AVX2-NEXT: setae %al
+; AVX2-NEXT: addq $168, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
@@ -4121,131 +1527,100 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $184, %rsp
+; AVX512-NEXT: subq $152, %rsp
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512-NEXT: movl %esi, %ecx
; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %esi
-; AVX512-NEXT: andl $56, %esi
-; AVX512-NEXT: negl %esi
-; AVX512-NEXT: movslq %esi, %rsi
-; AVX512-NEXT: movq 128(%rsp,%rsi), %r10
-; AVX512-NEXT: movq 136(%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rax
-; AVX512-NEXT: shldq %cl, %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 160(%rsp,%rsi), %r14
-; AVX512-NEXT: movq 168(%rsp,%rsi), %rax
-; AVX512-NEXT: shldq %cl, %r14, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 144(%rsp,%rsi), %r15
-; AVX512-NEXT: movq 152(%rsp,%rsi), %r11
-; AVX512-NEXT: movq %r11, %rbx
-; AVX512-NEXT: shldq %cl, %r15, %rbx
-; AVX512-NEXT: movq 120(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rax, %r10
-; AVX512-NEXT: shldq %cl, %r11, %r14
-; AVX512-NEXT: movq %rdi, %r9
-; AVX512-NEXT: movq 112(%rsp,%rsi), %r11
-; AVX512-NEXT: shldq %cl, %r12, %r15
-; AVX512-NEXT: movl %edx, %edx
+; AVX512-NEXT: movl %esi, %r8d
+; AVX512-NEXT: shrl $3, %r8d
+; AVX512-NEXT: movl %r8d, %eax
+; AVX512-NEXT: andl $56, %eax
+; AVX512-NEXT: negl %eax
+; AVX512-NEXT: movslq %eax, %r9
+; AVX512-NEXT: movq 88(%rsp,%r9), %r10
+; AVX512-NEXT: movq 96(%rsp,%r9), %rax
+; AVX512-NEXT: movq %rax, %rsi
+; AVX512-NEXT: shldq %cl, %r10, %rsi
+; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 104(%rsp,%r9), %rsi
+; AVX512-NEXT: movq %rsi, %r11
+; AVX512-NEXT: shldq %cl, %rax, %r11
+; AVX512-NEXT: movq 112(%rsp,%r9), %rax
+; AVX512-NEXT: movq %rax, %rbx
+; AVX512-NEXT: shldq %cl, %rsi, %rbx
+; AVX512-NEXT: movq 120(%rsp,%r9), %rsi
+; AVX512-NEXT: movq %rsi, %r14
+; AVX512-NEXT: shldq %cl, %rax, %r14
+; AVX512-NEXT: movq 128(%rsp,%r9), %rax
+; AVX512-NEXT: movq %rax, %r12
+; AVX512-NEXT: shldq %cl, %rsi, %r12
+; AVX512-NEXT: movq 136(%rsp,%r9), %r13
+; AVX512-NEXT: shldq %cl, %rax, %r13
+; AVX512-NEXT: movq 80(%rsp,%r9), %r15
+; AVX512-NEXT: shldq %cl, %r15, %r10
+; AVX512-NEXT: movl %edx, %eax
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq 16(%rdi), %r12
-; AVX512-NEXT: movq 48(%rdi), %r13
-; AVX512-NEXT: movq 32(%rdi), %rbp
-; AVX512-NEXT: andnq %rbp, %r15, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r15, %rbp
-; AVX512-NEXT: andnq %r13, %r14, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r14, %r13
-; AVX512-NEXT: andnq %r12, %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq %r10, %r12
-; AVX512-NEXT: movq 40(%rdi), %r8
-; AVX512-NEXT: orq %r13, %r12
-; AVX512-NEXT: andnq %r8, %rbx, %rdi
-; AVX512-NEXT: andq %rbx, %r8
-; AVX512-NEXT: movq 56(%r9), %r13
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT: andnq %r13, %rdx, %r10
-; AVX512-NEXT: andq %rdx, %r13
-; AVX512-NEXT: movq 24(%r9), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; AVX512-NEXT: andnq %rax, %rdx, %r15
-; AVX512-NEXT: andq %rdx, %rax
-; AVX512-NEXT: orq %r13, %rax
-; AVX512-NEXT: shlxq %rcx, %r11, %r13
-; AVX512-NEXT: movq (%r9), %rdx
-; AVX512-NEXT: andnq %rdx, %r13, %r14
-; AVX512-NEXT: andq %r13, %rdx
-; AVX512-NEXT: orq %rbp, %rdx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r11, %rbp
-; AVX512-NEXT: orq %r12, %rdx
-; AVX512-NEXT: movq 8(%r9), %r13
-; AVX512-NEXT: andnq %r13, %rbp, %rbx
-; AVX512-NEXT: andq %rbp, %r13
-; AVX512-NEXT: orq %r8, %r13
-; AVX512-NEXT: movq 24(%rsp,%rsi), %r8
-; AVX512-NEXT: orq %rax, %r13
-; AVX512-NEXT: movq 32(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, %r12
-; AVX512-NEXT: shldq %cl, %r8, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: orq %r12, %r11
-; AVX512-NEXT: movq 40(%rsp,%rsi), %r12
-; AVX512-NEXT: shldq %cl, %rax, %r12
-; AVX512-NEXT: orq %r12, %r10
-; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 8(%rsp,%rsi), %rax
-; AVX512-NEXT: movq 16(%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rbp
-; AVX512-NEXT: shldq %cl, %rax, %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: orq %rbp, %r10
-; AVX512-NEXT: shldq %cl, %r12, %r8
-; AVX512-NEXT: orq %r8, %rdi
-; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq -8(%rsp,%rsi), %r8
-; AVX512-NEXT: movq (%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rbp
-; AVX512-NEXT: shldq %cl, %r8, %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT: orq %rbp, %rdi
-; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi
-; AVX512-NEXT: shldq %cl, %r12, %rax
+; AVX512-NEXT: movq (%rsp,%r9), %rbp
+; AVX512-NEXT: movq 8(%rsp,%r9), %rsi
+; AVX512-NEXT: shldq %cl, %rbp, %rsi
+; AVX512-NEXT: movq -8(%rsp,%r9), %rdx
+; AVX512-NEXT: shldq %cl, %rdx, %rbp
+; AVX512-NEXT: movq -16(%rsp,%r9), %rax
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: andnq 56(%rdi), %r13, %r13
+; AVX512-NEXT: andnq 48(%rdi), %r12, %r12
+; AVX512-NEXT: orq %rsi, %r13
+; AVX512-NEXT: orq %rbp, %r12
+; AVX512-NEXT: andnq 40(%rdi), %r14, %r14
+; AVX512-NEXT: orq %rdx, %r14
+; AVX512-NEXT: movq -24(%rsp,%r9), %rsi
+; AVX512-NEXT: shldq %cl, %rsi, %rax
+; AVX512-NEXT: andnq 32(%rdi), %rbx, %rdx
+; AVX512-NEXT: orq %rax, %rdx
+; AVX512-NEXT: movq -32(%rsp,%r9), %rax
+; AVX512-NEXT: shldq %cl, %rax, %rsi
+; AVX512-NEXT: shlxq %rcx, %r15, %rbx
+; AVX512-NEXT: andnq 24(%rdi), %r11, %r11
+; AVX512-NEXT: orq %rsi, %r11
+; AVX512-NEXT: movq -48(%rsp,%r9), %rsi
+; AVX512-NEXT: movq -40(%rsp,%r9), %r9
+; AVX512-NEXT: shldq %cl, %r9, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: andnq 16(%rdi), %r15, %r15
; AVX512-NEXT: orq %rax, %r15
; AVX512-NEXT: shlxq %rcx, %rsi, %rax
; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rsi, %r8
-; AVX512-NEXT: orq %rax, %r14
-; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: orq %rdx, %r13
-; AVX512-NEXT: movq %r11, 48(%r9)
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, 56(%r9)
-; AVX512-NEXT: movq %r10, 32(%r9)
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, 40(%r9)
-; AVX512-NEXT: movq %rdi, 16(%r9)
-; AVX512-NEXT: movq %r15, 24(%r9)
-; AVX512-NEXT: movq %r14, (%r9)
-; AVX512-NEXT: movq %rbx, 8(%r9)
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: addq $184, %rsp
+; AVX512-NEXT: shldq %cl, %rsi, %r9
+; AVX512-NEXT: andnq 8(%rdi), %r10, %rcx
+; AVX512-NEXT: orq %r9, %rcx
+; AVX512-NEXT: andnq (%rdi), %rbx, %rsi
+; AVX512-NEXT: orq %rax, %rsi
+; AVX512-NEXT: andl $60, %r8d
+; AVX512-NEXT: movl (%rdi,%r8), %eax
+; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload
+; AVX512-NEXT: btl %r8d, %eax
+; AVX512-NEXT: movq %r13, 56(%rdi)
+; AVX512-NEXT: movq %r12, 48(%rdi)
+; AVX512-NEXT: movq %r14, 40(%rdi)
+; AVX512-NEXT: movq %rdx, 32(%rdi)
+; AVX512-NEXT: movq %r11, 24(%rdi)
+; AVX512-NEXT: movq %r15, 16(%rdi)
+; AVX512-NEXT: movq %rcx, 8(%rdi)
+; AVX512-NEXT: movq %rsi, (%rdi)
+; AVX512-NEXT: setae %al
+; AVX512-NEXT: addq $152, %rsp
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
@@ -4274,144 +1649,48 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i4096:
; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $4064, %edx # imm = 0xFE0
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
+; X86-NEXT: retl
+;
+; X64-LABEL: test_ne_i4096:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %eax
+; X64-NEXT: andl $4064, %eax # imm = 0xFE0
+; X64-NEXT: shrl $3, %eax
+; X64-NEXT: movl (%rdi,%rax), %eax
+; X64-NEXT: btl %esi, %eax
+; X64-NEXT: setb %al
+; X64-NEXT: retq
+ %rem = and i32 %position, 4095
+ %ofs = zext nneg i32 %rem to i4096
+ %bit = shl nuw i4096 1, %ofs
+ %ld = load i4096, ptr %word
+ %test = and i4096 %ld, %bit
+ %cmp = icmp ne i4096 %test, 0
+ ret i1 %cmp
+}
+
+; Special Cases
+
+; Multiple uses of the stored value
+define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
+; X86-LABEL: complement_cmpz_i128:
+; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $1792, %esp # imm = 0x700
-; X86-NEXT: movl 12(%ebp), %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: andl $508, %ecx # imm = 0x1FC
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: subl $64, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -4420,1061 +1699,35 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 248(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 252(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: andl $31, %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 504(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 508(%esi), %edx
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 120(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 124(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 376(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 380(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 184(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 188(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 440(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 444(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 56(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 60(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 312(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 316(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 216(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 220(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 472(%esi), %edi
-; X86-NEXT: movl 476(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 88(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 92(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 344(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 348(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 152(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 156(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 408(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 412(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 280(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 284(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 232(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 236(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 488(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 492(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 104(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 108(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 360(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 364(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 168(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 172(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 424(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 428(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 296(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 300(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 200(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 204(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 456(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 460(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 72(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 76(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 328(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 332(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 136(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 140(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 392(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 396(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 264(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 268(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 240(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 244(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 496(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 500(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 112(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 116(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 368(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 372(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 176(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 180(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 432(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 436(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 48(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 52(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 304(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 308(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 208(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 212(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 464(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 468(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 80(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 84(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 336(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 340(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 144(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 148(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 400(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 404(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 20(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 272(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 276(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 224(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 228(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 480(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 484(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 100(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 352(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 356(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 160(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 164(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 416(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 420(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 32(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 288(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 292(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 192(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 196(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 448(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 452(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 64(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 68(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 320(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 324(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 128(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 132(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %esi
+; X86-NEXT: movl 36(%esp,%esi), %eax
+; X86-NEXT: movl 40(%esp,%esi), %edi
; X86-NEXT: movl %edi, %edx
-; X86-NEXT: movl 256(%esi), %edi
-; X86-NEXT: movl 260(%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl %cl, %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shldl %cl, %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 388(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 4(%esi), %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shrdl $1, %eax, %edi
-; X86-NEXT: shrl %eax
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: notb %cl
-; X86-NEXT: shrdl %cl, %eax, %edi
-; X86-NEXT: shrl %cl, %ebx
-; X86-NEXT: movb $32, %cl
-; X86-NEXT: testb %cl, %cl
-; X86-NEXT: movl (%esi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 8(%ebp), %eax
-; X86-NEXT: jne .LBB20_2
-; X86-NEXT: # %bb.1:
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: .LBB20_2:
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shll %cl, %edx
-; X86-NEXT: orl %ebx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 320(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 64(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 448(%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 192(%eax), %ecx
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 288(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 32(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 416(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 160(%eax), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 352(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 96(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 480(%eax), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 224(%eax), %ecx
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: orl %edi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 272(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 16(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 400(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 144(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 336(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 80(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 464(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 208(%eax), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 304(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 48(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 432(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 176(%eax), %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 368(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 112(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 496(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: andl 240(%eax), %ebx
-; X86-NEXT: orl %ecx, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: orl %esi, %ebx
-; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl 32(%esp,%esi), %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 264(%eax), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 8(%eax), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 392(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 136(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 328(%ebx), %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 72(%ebx), %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 456(%ebx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 200(%ebx), %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 296(%ebx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 40(%ebx), %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 424(%ebx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 168(%ebx), %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 360(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 104(%ebx), %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 488(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 232(%ebx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 280(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 24(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 408(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 152(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 344(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 88(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 472(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 216(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 312(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 56(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 440(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 184(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 376(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 120(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 504(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 248(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 324(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 68(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 452(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 196(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 292(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 36(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 420(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 164(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 356(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 100(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 484(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 228(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 276(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 20(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 404(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 148(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 340(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 84(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 468(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 212(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 308(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 52(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 436(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 180(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 372(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 116(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 500(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 244(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 268(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 12(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 396(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 140(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 332(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 76(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 460(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 204(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 300(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 44(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 428(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 172(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 364(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 108(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 492(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: andl 236(%ebx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 284(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 28(%ebx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 412(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 156(%ebx), %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 348(%ebx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 92(%ebx), %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 476(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 220(%ebx), %eax
-; X86-NEXT: orl %ecx, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: orl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 316(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 60(%ebx), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 444(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: andl 188(%ebx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: orl %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 380(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: andl 124(%ebx), %edx
-; X86-NEXT: orl %ecx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 508(%ebx), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %esi
-; X86-NEXT: andl 252(%esi), %ebx
-; X86-NEXT: orl %ecx, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: negl %ecx
-; X86-NEXT: movl 1648(%esp,%ecx), %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl 44(%esp,%esi), %esi
; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: andl 128(%edx), %ecx
-; X86-NEXT: andl 384(%edx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: andl (%edx), %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 256(%edx), %eax
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: xorl 12(%ecx), %esi
+; X86-NEXT: xorl 8(%ecx), %edx
+; X86-NEXT: xorl 4(%ecx), %eax
+; X86-NEXT: xorl (%ecx), %edi
+; X86-NEXT: movl %edx, 8(%ecx)
+; X86-NEXT: movl %esi, 12(%ecx)
+; X86-NEXT: movl %edi, (%ecx)
+; X86-NEXT: movl %eax, 4(%ecx)
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl %edx, %edi
; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 260(%edx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 4(%edx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 132(%edx), %eax
-; X86-NEXT: andl 388(%edx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: orl %edi, %esi
; X86-NEXT: setne %al
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
@@ -5483,1545 +1736,231 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: test_ne_i4096:
+; SSE-LABEL: complement_cmpz_i128:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $1576, %rsp # imm = 0x628
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl %esi, %eax
-; SSE-NEXT: andl $4032, %eax # imm = 0xFC0
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %eax
-; SSE-NEXT: negl %eax
-; SSE-NEXT: movslq %eax, %rsi
-; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1304(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1560(%rsp,%rsi), %rax
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1176(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1432(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1240(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1496(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1112(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; SSE-NEXT: movq 1368(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1272(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1528(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1144(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1400(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1208(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1464(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1080(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1336(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1288(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1544(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1160(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1416(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1216(%rsp,%rsi), %r11
-; SSE-NEXT: movq 1224(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %r11, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1480(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1088(%rsp,%rsi), %r9
-; SSE-NEXT: movq 1096(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %r9, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1352(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1248(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1512(%rsp,%rsi), %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rdx, %rax
-; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1120(%rsp,%rsi), %rax
-; SSE-NEXT: movq 1128(%rsp,%rsi), %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: shldq %cl, %rax, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1376(%rsp,%rsi), %r13
-; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx
-; SSE-NEXT: movq %rbx, %r8
-; SSE-NEXT: shldq %cl, %r13, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1192(%rsp,%rsi), %r15
-; SSE-NEXT: movq %r15, %r14
-; SSE-NEXT: shldq %cl, %rdx, %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1440(%rsp,%rsi), %r10
-; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx
-; SSE-NEXT: movq %rdx, %r14
-; SSE-NEXT: shldq %cl, %r10, %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1312(%rsp,%rsi), %r14
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp
-; SSE-NEXT: movq %rbp, %r12
-; SSE-NEXT: shldq %cl, %r14, %r12
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx
-; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rbp, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: shldq %cl, %rdx, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r9
-; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r15, %r13
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r12, %r15
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: shldq %cl, %r12, %r10
-; SSE-NEXT: andq 384(%rdi), %r10
-; SSE-NEXT: andq 128(%rdi), %r15
-; SSE-NEXT: andq 320(%rdi), %r13
-; SSE-NEXT: andq 64(%rdi), %rax
-; SSE-NEXT: orq %r10, %r15
-; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: andq 448(%rdi), %r9
-; SSE-NEXT: andq 192(%rdi), %rbp
-; SSE-NEXT: orq %r9, %rbp
-; SSE-NEXT: orq %rax, %rbp
-; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: andq 288(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 32(%rdi), %r9
-; SSE-NEXT: andq 416(%rdi), %rdx
-; SSE-NEXT: andq 160(%rdi), %r11
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 352(%rdi), %rdx
-; SSE-NEXT: orq %r9, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 96(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 480(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 224(%rdi), %r8
-; SSE-NEXT: orq %rax, %r8
-; SSE-NEXT: orq %rdx, %r8
-; SSE-NEXT: andq 272(%rdi), %r14
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 16(%rdi), %rax
-; SSE-NEXT: orq %r14, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 400(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 144(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 336(%rdi), %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 80(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 464(%rdi), %rdx
-; SSE-NEXT: orq %r9, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 208(%rdi), %r11
-; SSE-NEXT: orq %rdx, %r11
-; SSE-NEXT: orq %rax, %r11
-; SSE-NEXT: orq %r8, %r11
-; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload
-; SSE-NEXT: andq 304(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 48(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 432(%rdi), %r9
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 176(%rdi), %r8
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 368(%rdi), %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 112(%rdi), %rax
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: movq %r8, %r10
-; SSE-NEXT: orq %r9, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 496(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; SSE-NEXT: andq 240(%rdi), %rbp
-; SSE-NEXT: orq %r8, %rbp
-; SSE-NEXT: orq %rax, %rbp
-; SSE-NEXT: orq %r10, %rbp
-; SSE-NEXT: orq %r11, %rbp
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 392(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE-NEXT: andq 136(%rdi), %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 328(%rdi), %rdx
-; SSE-NEXT: orq %rax, %r12
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 72(%rdi), %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 456(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE-NEXT: andq 200(%rdi), %r13
-; SSE-NEXT: orq %rax, %r13
-; SSE-NEXT: orq %rdx, %r13
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 296(%rdi), %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 40(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 424(%rdi), %r8
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %rax, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE-NEXT: andq 168(%rdi), %rdx
-; SSE-NEXT: orq %r8, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 360(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 104(%rdi), %rax
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 488(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: andq 232(%rdi), %r15
-; SSE-NEXT: orq %rax, %r15
-; SSE-NEXT: orq %r8, %r15
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 280(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 24(%rdi), %rax
-; SSE-NEXT: orq %rdx, %r15
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 408(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 152(%rdi), %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 344(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 88(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 472(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE-NEXT: andq 216(%rdi), %r14
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: orq %rax, %r14
-; SSE-NEXT: orq %r8, %r14
-; SSE-NEXT: orq %r10, %r14
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 312(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 56(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 440(%rdi), %r8
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; SSE-NEXT: andq 184(%rdi), %r9
-; SSE-NEXT: orq %r11, %r10
-; SSE-NEXT: orq %r8, %r9
-; SSE-NEXT: orq %r10, %r9
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: xorl %edx, %edx
; SSE-NEXT: shldq %cl, %rax, %rdx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE-NEXT: andq 376(%rdi), %r10
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE-NEXT: andq 120(%rdi), %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: andq 504(%rdi), %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: andq 248(%rdi), %r8
-; SSE-NEXT: orq %r10, %rax
-; SSE-NEXT: movq %rax, %r10
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: movq 1056(%rsp,%rsi), %rax
-; SSE-NEXT: shldq %cl, %rax, %rbx
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: andq 256(%rdi), %rdx
-; SSE-NEXT: orq %r14, %r8
-; SSE-NEXT: andq (%rdi), %rax
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: cmovneq %rsi, %rax
+; SSE-NEXT: xorq 8(%rdi), %rdx
+; SSE-NEXT: xorq (%rdi), %rax
+; SSE-NEXT: movq %rax, (%rdi)
+; SSE-NEXT: movq %rdx, 8(%rdi)
; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; SSE-NEXT: orq %rbp, %rax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: andq 264(%rdi), %rcx
-; SSE-NEXT: andq 8(%rdi), %rbx
-; SSE-NEXT: orq %rcx, %rbx
-; SSE-NEXT: orq %r12, %rbx
-; SSE-NEXT: orq %r13, %rbx
-; SSE-NEXT: orq %r15, %rbx
-; SSE-NEXT: orq %r8, %rbx
-; SSE-NEXT: orq %rax, %rbx
; SSE-NEXT: setne %al
-; SSE-NEXT: addq $1576, %rsp # imm = 0x628
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
-; AVX2-LABEL: test_ne_i4096:
+; AVX2-LABEL: complement_cmpz_i128:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $1560, %rsp # imm = 0x618
; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: movl %esi, %eax
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %eax
-; AVX2-NEXT: negl %eax
-; AVX2-NEXT: movslq %eax, %rsi
-; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11
-; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %r11, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12
-; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %r12, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rdx, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp
-; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: shldq %cl, %rbp, %rax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax
-; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl $1, %eax
+; AVX2-NEXT: xorl %edx, %edx
; AVX2-NEXT: shldq %cl, %rax, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10
-; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8
-; AVX2-NEXT: movq %r8, %rdx
-; AVX2-NEXT: shldq %cl, %r10, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx
-; AVX2-NEXT: movq %rbx, %rdx
-; AVX2-NEXT: shldq %cl, %r9, %rdx
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9
-; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx
-; AVX2-NEXT: movq %rdx, %r14
-; AVX2-NEXT: shldq %cl, %r9, %r14
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15
-; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14
-; AVX2-NEXT: movq %r14, %r13
-; AVX2-NEXT: shldq %cl, %r15, %r13
-; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx
-; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r14, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r15, %r9
-; AVX2-NEXT: andq 384(%rdi), %r9
-; AVX2-NEXT: andq 128(%rdi), %r14
-; AVX2-NEXT: andq 320(%rdi), %r10
-; AVX2-NEXT: orq %r9, %r14
-; AVX2-NEXT: movq %r14, %r15
-; AVX2-NEXT: andq 64(%rdi), %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: andq 448(%rdi), %rbp
-; AVX2-NEXT: andq 192(%rdi), %r13
-; AVX2-NEXT: orq %rbp, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: andq 288(%rdi), %r8
-; AVX2-NEXT: andq 32(%rdi), %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 416(%rdi), %rax
-; AVX2-NEXT: orq %r8, %r12
-; AVX2-NEXT: andq 160(%rdi), %r11
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: andq 352(%rdi), %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 96(%rdi), %rax
-; AVX2-NEXT: orq %r12, %r11
-; AVX2-NEXT: orq %rbx, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 480(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: andq 224(%rdi), %r13
-; AVX2-NEXT: orq %r10, %r13
-; AVX2-NEXT: orq %rax, %r13
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 272(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 16(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r13
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 400(%rdi), %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 144(%rdi), %rax
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 336(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 80(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 464(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 208(%rdi), %r11
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: orq %r8, %r11
-; AVX2-NEXT: orq %rax, %r11
-; AVX2-NEXT: orq %r9, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 304(%rdi), %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 48(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 432(%rdi), %r10
-; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload
-; AVX2-NEXT: andq 176(%rdi), %rax
-; AVX2-NEXT: orq %r9, %r8
-; AVX2-NEXT: movq %r8, %r9
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 368(%rdi), %r8
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 112(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 496(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX2-NEXT: andq 240(%rdi), %r9
-; AVX2-NEXT: orq %r8, %r9
-; AVX2-NEXT: orq %rax, %r9
-; AVX2-NEXT: orq %r10, %r9
-; AVX2-NEXT: orq %r11, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 392(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX2-NEXT: andq 136(%rdi), %rbp
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 328(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 72(%rdi), %rax
-; AVX2-NEXT: orq %r10, %rbp
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 456(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX2-NEXT: andq 200(%rdi), %r12
-; AVX2-NEXT: orq %rax, %r12
-; AVX2-NEXT: orq %r8, %r12
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 296(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 40(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 424(%rdi), %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 168(%rdi), %rax
-; AVX2-NEXT: orq %r10, %r8
-; AVX2-NEXT: movq %r8, %r10
-; AVX2-NEXT: orq %r11, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 360(%rdi), %r8
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 104(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 488(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: andq 232(%rdi), %r14
-; AVX2-NEXT: orq %rax, %r14
-; AVX2-NEXT: orq %r8, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 280(%rdi), %r8
-; AVX2-NEXT: orq %r10, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 24(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 408(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 152(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: andq 344(%rdi), %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 88(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 472(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: andq 216(%rdi), %rbx
-; AVX2-NEXT: orq %rax, %rbx
-; AVX2-NEXT: orq %r8, %rbx
-; AVX2-NEXT: orq %r10, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 312(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 56(%rdi), %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 440(%rdi), %r10
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: movq %rax, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 184(%rdi), %r8
-; AVX2-NEXT: orq %r10, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: andq 376(%rdi), %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 120(%rdi), %rax
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: movq %r8, %r11
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: andq 504(%rdi), %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: andq 248(%rdi), %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: shldq %cl, %r8, %r10
-; AVX2-NEXT: orq %r11, %rax
-; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi
-; AVX2-NEXT: orq %rbx, %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: shlxq %rcx, %rsi, %rax
-; AVX2-NEXT: andq 256(%rdi), %r10
-; AVX2-NEXT: andq (%rdi), %rax
-; AVX2-NEXT: orq %r10, %rax
-; AVX2-NEXT: orq %r15, %rax
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
-; AVX2-NEXT: orq %r13, %rax
-; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX2-NEXT: shldq %cl, %rsi, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: andq 264(%rdi), %rcx
-; AVX2-NEXT: andq 8(%rdi), %rdx
-; AVX2-NEXT: orq %r9, %rax
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: orq %rbp, %rdx
-; AVX2-NEXT: orq %r12, %rdx
-; AVX2-NEXT: orq %r14, %rdx
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: orq %rax, %rdx
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: shlxq %rcx, %rax, %rax
+; AVX2-NEXT: testb $64, %cl
+; AVX2-NEXT: cmovneq %rax, %rdx
+; AVX2-NEXT: cmovneq %rsi, %rax
+; AVX2-NEXT: xorq 8(%rdi), %rdx
+; AVX2-NEXT: xorq (%rdi), %rax
+; AVX2-NEXT: movq %rax, (%rdi)
+; AVX2-NEXT: movq %rdx, 8(%rdi)
+; AVX2-NEXT: orq %rdx, %rax
; AVX2-NEXT: setne %al
-; AVX2-NEXT: addq $1560, %rsp # imm = 0x618
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
-; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: test_ne_i4096:
+; AVX512-LABEL: complement_cmpz_i128:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $1560, %rsp # imm = 0x618
; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl %esi, %eax
-; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512-NEXT: andl $63, %ecx
-; AVX512-NEXT: shrl $3, %eax
-; AVX512-NEXT: negl %eax
-; AVX512-NEXT: movslq %eax, %rsi
-; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10
-; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r10, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14
-; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r14, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rdx, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12
-; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %r12, %rax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax
-; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11
-; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx
-; AVX512-NEXT: movq %rbx, %rdx
-; AVX512-NEXT: shldq %cl, %r11, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9
-; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8
-; AVX512-NEXT: movq %r8, %rdx
-; AVX512-NEXT: shldq %cl, %r9, %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9
-; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx
-; AVX512-NEXT: movq %rdx, %r15
-; AVX512-NEXT: shldq %cl, %r9, %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp
-; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15
-; AVX512-NEXT: movq %r15, %r13
-; AVX512-NEXT: shldq %cl, %rbp, %r13
-; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx
-; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r13
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %r15, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbp, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rbp, %r9
-; AVX512-NEXT: andq 384(%rdi), %r9
-; AVX512-NEXT: andq 128(%rdi), %r15
-; AVX512-NEXT: orq %r9, %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andq 320(%rdi), %r11
-; AVX512-NEXT: andq 64(%rdi), %rax
-; AVX512-NEXT: orq %r11, %rax
-; AVX512-NEXT: andq 448(%rdi), %r12
-; AVX512-NEXT: andq 192(%rdi), %r13
-; AVX512-NEXT: orq %r12, %r13
-; AVX512-NEXT: orq %rax, %r13
-; AVX512-NEXT: andq 288(%rdi), %r8
-; AVX512-NEXT: andq 32(%rdi), %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 416(%rdi), %rax
-; AVX512-NEXT: orq %r8, %r14
-; AVX512-NEXT: andq 160(%rdi), %r10
-; AVX512-NEXT: orq %rax, %r10
-; AVX512-NEXT: andq 352(%rdi), %rbx
-; AVX512-NEXT: orq %r14, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 96(%rdi), %rax
-; AVX512-NEXT: orq %rbx, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 480(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: andq 224(%rdi), %r15
-; AVX512-NEXT: orq %rax, %r15
-; AVX512-NEXT: orq %r8, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 272(%rdi), %r8
-; AVX512-NEXT: orq %r10, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 16(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 400(%rdi), %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 144(%rdi), %rax
-; AVX512-NEXT: orq %r9, %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 336(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 80(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 464(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: andq 208(%rdi), %r11
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: orq %r8, %r11
-; AVX512-NEXT: orq %rax, %r11
-; AVX512-NEXT: orq %r9, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 304(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 48(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 432(%rdi), %r9
-; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 176(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: orq %r9, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 368(%rdi), %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 112(%rdi), %rax
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: movq %r8, %r10
-; AVX512-NEXT: orq %r9, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 496(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; AVX512-NEXT: andq 240(%rdi), %r9
-; AVX512-NEXT: orq %r8, %r9
-; AVX512-NEXT: orq %rax, %r9
-; AVX512-NEXT: orq %r10, %r9
-; AVX512-NEXT: orq %r11, %r9
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 392(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
-; AVX512-NEXT: andq 136(%rdi), %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 328(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 72(%rdi), %rax
-; AVX512-NEXT: orq %r10, %rbp
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 456(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX512-NEXT: andq 200(%rdi), %r12
-; AVX512-NEXT: orq %rax, %r12
-; AVX512-NEXT: orq %r8, %r12
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 296(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 40(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 424(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 168(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 360(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 104(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 488(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT: andq 232(%rdi), %r14
-; AVX512-NEXT: orq %rax, %r14
-; AVX512-NEXT: orq %r8, %r14
-; AVX512-NEXT: orq %r10, %r14
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 280(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 24(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 408(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 152(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: andq 344(%rdi), %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 88(%rdi), %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 472(%rdi), %rax
-; AVX512-NEXT: orq %r11, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: andq 216(%rdi), %rbx
-; AVX512-NEXT: orq %rax, %rbx
-; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: orq %r10, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: andq 312(%rdi), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 56(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 440(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 184(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 376(%rdi), %r8
-; AVX512-NEXT: orq %r10, %rax
-; AVX512-NEXT: movq %rax, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 120(%rdi), %rax
-; AVX512-NEXT: orq %r8, %rax
-; AVX512-NEXT: movq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 504(%rdi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: andq 248(%rdi), %r8
-; AVX512-NEXT: orq %rax, %r8
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: orq %r11, %r8
-; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: shldq %cl, %rsi, %r10
-; AVX512-NEXT: orq %rbx, %r8
-; AVX512-NEXT: shlxq %rcx, %rax, %rsi
-; AVX512-NEXT: andq 256(%rdi), %r10
-; AVX512-NEXT: andq (%rdi), %rsi
-; AVX512-NEXT: orq %r10, %rsi
-; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512-NEXT: orq %r13, %rsi
-; AVX512-NEXT: orq %r15, %rsi
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rax, %rdx
-; AVX512-NEXT: orq %r9, %rsi
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: andq 264(%rdi), %rax
-; AVX512-NEXT: andq 8(%rdi), %rdx
-; AVX512-NEXT: orq %rax, %rdx
-; AVX512-NEXT: orq %rbp, %rdx
-; AVX512-NEXT: orq %r12, %rdx
-; AVX512-NEXT: orq %r14, %rdx
-; AVX512-NEXT: orq %r8, %rdx
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movl $1, %edx
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: shldq %cl, %rdx, %rsi
+; AVX512-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX512-NEXT: testb $64, %cl
+; AVX512-NEXT: cmovneq %rdx, %rsi
+; AVX512-NEXT: cmovneq %rax, %rdx
+; AVX512-NEXT: xorq 8(%rdi), %rsi
+; AVX512-NEXT: xorq (%rdi), %rdx
+; AVX512-NEXT: movq %rdx, (%rdi)
+; AVX512-NEXT: movq %rsi, 8(%rdi)
; AVX512-NEXT: orq %rsi, %rdx
; AVX512-NEXT: setne %al
-; AVX512-NEXT: addq $1560, %rsp # imm = 0x618
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
- %rem = and i32 %position, 4095
- %ofs = zext nneg i32 %rem to i4096
- %bit = shl nuw i4096 1, %ofs
- %ld = load i4096, ptr %word
- %test = and i4096 %ld, %bit
- %cmp = icmp ne i4096 %test, 0
+ %rem = and i32 %position, 127
+ %ofs = zext nneg i32 %rem to i128
+ %bit = shl nuw i128 1, %ofs
+ %ld = load i128, ptr %word
+ %res = xor i128 %ld, %bit
+ store i128 %res, ptr %word
+ %cmp = icmp ne i128 %res, 0
ret i1 %cmp
}
+
+; Multiple loads in store chain
+; Clears one bit of the i128 stored at %word (bit index is %position masked to
+; 0..127) while a second, i32 load from %p is issued ahead of the store.
+; Returns the value loaded from %p when the selected bit was already clear in
+; the loaded i128, and 0 when the bit was set.
+define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
+; X86-LABEL: reset_multiload_i128:
+; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $64, %esp
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %edi
+; X86-NEXT: movl 36(%esp,%edi), %edx
+; X86-NEXT: movl 40(%esp,%edi), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl 32(%esp,%edi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%esp,%edi), %edi
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: andl $96, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl (%ecx,%eax), %eax
+; X86-NEXT: andl %ebx, (%ecx)
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: notl %edx
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: andl %edx, 4(%ebx)
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %esi, 8(%ebx)
+; X86-NEXT: notl %edi
+; X86-NEXT: andl %edi, 12(%ebx)
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: jae .LBB22_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB22_2:
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
+; X86-NEXT: retl
+;
+; SSE-LABEL: reset_multiload_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %esi
+; SSE-NEXT: xorl %r8d, %r8d
+; SSE-NEXT: shldq %cl, %rsi, %r8
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: shlq %cl, %rsi
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rsi, %r8
+; SSE-NEXT: cmovneq %rax, %rsi
+; SSE-NEXT: notq %r8
+; SSE-NEXT: notq %rsi
+; SSE-NEXT: movl %ecx, %r9d
+; SSE-NEXT: andl $96, %r9d
+; SSE-NEXT: shrl $3, %r9d
+; SSE-NEXT: movl (%rdi,%r9), %r9d
+; SSE-NEXT: btl %ecx, %r9d
+; SSE-NEXT: jb .LBB22_2
+; SSE-NEXT: # %bb.1:
+; SSE-NEXT: movl (%rdx), %eax
+; SSE-NEXT: .LBB22_2:
+; SSE-NEXT: andq %r8, 8(%rdi)
+; SSE-NEXT: andq %rsi, (%rdi)
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: reset_multiload_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: movl $1, %r8d
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: shldq %cl, %r8, %rsi
+; AVX2-NEXT: shlxq %rcx, %r8, %r8
+; AVX2-NEXT: testb $64, %cl
+; AVX2-NEXT: cmovneq %r8, %rsi
+; AVX2-NEXT: cmovneq %rax, %r8
+; AVX2-NEXT: notq %rsi
+; AVX2-NEXT: notq %r8
+; AVX2-NEXT: movl %ecx, %r9d
+; AVX2-NEXT: andl $96, %r9d
+; AVX2-NEXT: shrl $3, %r9d
+; AVX2-NEXT: movl (%rdi,%r9), %r9d
+; AVX2-NEXT: btl %ecx, %r9d
+; AVX2-NEXT: jb .LBB22_2
+; AVX2-NEXT: # %bb.1:
+; AVX2-NEXT: movl (%rdx), %eax
+; AVX2-NEXT: .LBB22_2:
+; AVX2-NEXT: andq %rsi, 8(%rdi)
+; AVX2-NEXT: andq %r8, (%rdi)
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: reset_multiload_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: movl $1, %r8d
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: shldq %cl, %r8, %rsi
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: shlxq %rcx, %r8, %r8
+; AVX512-NEXT: testb $64, %cl
+; AVX512-NEXT: cmovneq %r8, %rsi
+; AVX512-NEXT: cmovneq %rax, %r8
+; AVX512-NEXT: notq %rsi
+; AVX512-NEXT: notq %r8
+; AVX512-NEXT: movl %ecx, %r9d
+; AVX512-NEXT: andl $96, %r9d
+; AVX512-NEXT: shrl $3, %r9d
+; AVX512-NEXT: movl (%rdi,%r9), %r9d
+; AVX512-NEXT: btl %ecx, %r9d
+; AVX512-NEXT: jb .LBB22_2
+; AVX512-NEXT: # %bb.1:
+; AVX512-NEXT: movl (%rdx), %eax
+; AVX512-NEXT: .LBB22_2:
+; AVX512-NEXT: andq %rsi, 8(%rdi)
+; AVX512-NEXT: andq %r8, (%rdi)
+; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT: retq
+ ; Keep only the low 7 bits of the position so the bit index is in 0..127,
+ ; then widen it to i128 and build a one-hot mask plus its complement.
+ %rem = and i32 %position, 127
+ %ofs = zext nneg i32 %rem to i128
+ %bit = shl nuw i128 1, %ofs
+ %mask = xor i128 %bit, -1
+ ; Two loads precede the store below, one of the wide integer and one of an
+ ; unrelated i32 through %p.
+ %ld = load i128, ptr %word
+ %sel = load i32, ptr %p
+ ; %test isolates the selected bit, %res is the loaded value with it cleared.
+ %test = and i128 %ld, %bit
+ %res = and i128 %ld, %mask
+ ; %cmp is true when the selected bit was not set in the loaded value.
+ %cmp = icmp eq i128 %test, 0
+ store i128 %res, ptr %word
+ ; Yield the i32 loaded from %p if the bit was clear, otherwise 0.
+ %ret = select i1 %cmp, i32 %sel, i32 0
+ ret i32 %ret
+}