diff options
Diffstat (limited to 'llvm/test/CodeGen/X86/bittest-big-integer.ll')
| -rw-r--r-- | llvm/test/CodeGen/X86/bittest-big-integer.ll | 7137 | 
1 files changed, 1038 insertions, 6099 deletions
| diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 19d751d1..06e7d47 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -203,24 +203,14 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {  define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {  ; X86-LABEL: test_ne_i64:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %esi  ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl $1, %edx -; X86-NEXT:    xorl %esi, %esi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    shll %cl, %edx -; X86-NEXT:    testb $32, %cl -; X86-NEXT:    je .LBB5_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %edx, %esi -; X86-NEXT:    xorl %edx, %edx -; X86-NEXT:  .LBB5_2: -; X86-NEXT:    andl 4(%eax), %esi -; X86-NEXT:    andl (%eax), %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    setne %al -; X86-NEXT:    popl %esi +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl %ecx, %edx +; X86-NEXT:    andl $32, %edx +; X86-NEXT:    shrl $3, %edx +; X86-NEXT:    movl (%eax,%edx), %eax +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    setb %al  ; X86-NEXT:    retl  ;  ; X64-LABEL: test_ne_i64: @@ -242,38 +232,20 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {  define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {  ; X86-LABEL: complement_ne_i64:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx  ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl $1, %eax -; X86-NEXT:    xorl %esi, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    testb $32, %cl -; X86-NEXT:    je .LBB6_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    xorl %eax, %eax -; X86-NEXT:  .LBB6_2: -; X86-NEXT:    movl (%edx), %ecx -; X86-NEXT:    movl 4(%edx), %edi -; X86-NEXT:    movl %edi, %ebx -; X86-NEXT:    andl %esi, %ebx -; X86-NEXT:    movl %ecx, %ebp -; X86-NEXT:    andl %eax, %ebp -; X86-NEXT:    xorl %esi, %edi -; X86-NEXT:    xorl %eax, %ecx -; X86-NEXT:    orl %ebx, %ebp -; X86-NEXT:    setne %al -; X86-NEXT:    movl %ecx, (%edx) -; X86-NEXT:    movl %edi, 4(%edx) +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $32, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btcl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ;  ; X64-LABEL: complement_ne_i64: @@ -300,40 +272,20 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {  define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {  ; X86-LABEL: reset_eq_i64:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx  ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl $1, %esi -; X86-NEXT:    xorl %edi, %edi -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    shll %cl, %esi -; X86-NEXT:    testb $32, %cl -; X86-NEXT:    je .LBB7_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %esi, %edi -; X86-NEXT:    xorl %esi, %esi -; X86-NEXT:  .LBB7_2: -; X86-NEXT:    movl (%edx), %eax -; X86-NEXT:    movl 4(%edx), %ecx -; X86-NEXT:    movl %ecx, %ebx -; X86-NEXT:    andl %edi, %ebx -; X86-NEXT:    notl %edi -; X86-NEXT:    movl %eax, %ebp -; X86-NEXT:    andl %esi, %ebp -; X86-NEXT:    notl %esi -; X86-NEXT:    andl %ecx, %edi -; X86-NEXT:    andl %eax, %esi -; X86-NEXT:    orl %ebx, %ebp -; X86-NEXT:    sete %al -; X86-NEXT:    movl %esi, (%edx) -; X86-NEXT:    movl %edi, 4(%edx) +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $32, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setae %al +; X86-NEXT:    btrl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ;  ; X64-LABEL: reset_eq_i64: @@ -361,38 +313,20 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {  define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {  ; X86-LABEL: set_ne_i64:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx  ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl $1, %eax -; X86-NEXT:    xorl %esi, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    testb $32, %cl -; X86-NEXT:    je .LBB8_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    xorl %eax, %eax -; X86-NEXT:  .LBB8_2: -; X86-NEXT:    movl (%edx), %ecx -; X86-NEXT:    movl 4(%edx), %edi -; X86-NEXT:    movl %edi, %ebx -; X86-NEXT:    andl %esi, %ebx -; X86-NEXT:    movl %ecx, %ebp -; X86-NEXT:    andl %eax, %ebp -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    orl %ebx, %ebp -; X86-NEXT:    setne %al -; X86-NEXT:    movl %ecx, (%edx) -; X86-NEXT:    movl %edi, 4(%edx) +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $32, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btsl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ;  ; X64-LABEL: set_ne_i64: @@ -419,52 +353,47 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {  define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; X86-LABEL: init_eq_i64:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp  ; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl $1, %eax -; X86-NEXT:    xorl %edx, %edx -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl $1, %edx +; X86-NEXT:    xorl %esi, %esi +; X86-NEXT:    shldl %cl, %edx, %esi +; X86-NEXT:    shll %cl, %edx +; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax  ; X86-NEXT:    xorl %edi, %edi -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    shll %cl, %esi +; X86-NEXT:    shldl %cl, %eax, %edi +; X86-NEXT:    shll %cl, %eax  ; X86-NEXT:    testb $32, %cl  ; X86-NEXT:    je .LBB9_2  ; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %eax, %edx -; X86-NEXT:    movl $0, %eax +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    movl $0, %edx  ; X86-NEXT:  .LBB9_2: -; X86-NEXT:    movl %edx, %ebx -; X86-NEXT:    notl %ebx -; X86-NEXT:    movl %eax, %ebp -; X86-NEXT:    notl %ebp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT:    notl %esi +; X86-NEXT:    notl %edx  ; X86-NEXT:    je .LBB9_4  ; X86-NEXT:  # %bb.3: -; X86-NEXT:    movl %esi, %edi -; X86-NEXT:    xorl %esi, %esi +; X86-NEXT:    movl %eax, %edi +; X86-NEXT:    xorl %eax, %eax  ; X86-NEXT:  .LBB9_4: -; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl 4(%ecx), %ecx -; X86-NEXT:    andl %ecx, %edx -; X86-NEXT:    andl %ecx, %ebx -; X86-NEXT:    orl %edi, %ebx -; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi -; X86-NEXT:    movl (%edi), %ecx -; X86-NEXT:    andl %ecx, %eax -; X86-NEXT:    andl %ecx, %ebp -; X86-NEXT:    orl %esi, %ebp -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl %ebp, (%edi) -; X86-NEXT:    movl %ebx, 4(%edi) -; X86-NEXT:    sete %al +; X86-NEXT:    andl 4(%ebx), %esi +; X86-NEXT:    orl %edi, %esi +; X86-NEXT:    andl (%ebx), %edx +; X86-NEXT:    orl %eax, %edx +; X86-NEXT:    movl %ecx, %eax +; X86-NEXT:    andl $32, %eax +; X86-NEXT:    shrl $3, %eax +; X86-NEXT:    movl (%ebx,%eax), %eax +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    setae %al +; X86-NEXT:    movl %esi, 4(%ebx) +; X86-NEXT:    movl %edx, (%ebx)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi  ; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ;  ; SSE-LABEL: init_eq_i64: @@ -516,101 +445,25 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {  define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {  ; X86-LABEL: test_ne_i128:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx -; X86-NEXT:    pushl %edi -; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $48, %esp -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, (%esp) -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrb $3, %al -; X86-NEXT:    andb $12, %al -; X86-NEXT:    negb %al -; X86-NEXT:    movsbl %al, %esi -; X86-NEXT:    movl 24(%esp,%esi), %edi -; X86-NEXT:    movl 28(%esp,%esi), %eax -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl 16(%esp,%esi), %edx -; X86-NEXT:    movl 20(%esp,%esi), %esi -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    shll %cl, %edx -; X86-NEXT:    andl 8(%ebx), %edi -; X86-NEXT:    andl (%ebx), %edx -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    andl 12(%ebx), %eax -; X86-NEXT:    andl 4(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp -; X86-NEXT:    popl %esi -; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl %ecx, %edx +; X86-NEXT:    andl $96, %edx +; X86-NEXT:    shrl $3, %edx +; X86-NEXT:    movl (%eax,%edx), %eax +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    setb %al  ; X86-NEXT:    retl  ; -; SSE-LABEL: test_ne_i128: -; SSE:       # %bb.0: -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl $1, %eax -; SSE-NEXT:    xorl %edx, %edx -; SSE-NEXT:    shldq %cl, %rax, %rdx -; SSE-NEXT:    xorl %esi, %esi -; SSE-NEXT:    shlq %cl, %rax -; SSE-NEXT:    testb $64, %cl -; SSE-NEXT:    cmovneq %rax, %rdx -; SSE-NEXT:    cmovneq %rsi, %rax -; SSE-NEXT:    andq 8(%rdi), %rdx -; SSE-NEXT:    andq (%rdi), %rax -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    setne %al -; SSE-NEXT:    retq -; -; AVX2-LABEL: test_ne_i128: -; AVX2:       # %bb.0: -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    xorl %eax, %eax -; AVX2-NEXT:    movl $1, %edx -; AVX2-NEXT:    xorl %esi, %esi -; AVX2-NEXT:    shldq %cl, %rdx, %rsi -; AVX2-NEXT:    shlxq %rcx, %rdx, %rdx -; AVX2-NEXT:    testb $64, %cl -; AVX2-NEXT:    cmovneq %rdx, %rsi -; AVX2-NEXT:    cmovneq %rax, %rdx -; AVX2-NEXT:    andq 8(%rdi), %rsi -; AVX2-NEXT:    andq (%rdi), %rdx -; AVX2-NEXT:    orq %rsi, %rdx -; AVX2-NEXT:    setne %al -; AVX2-NEXT:    retq -; -; AVX512-LABEL: test_ne_i128: -; AVX512:       # %bb.0: -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    movl $1, %eax -; AVX512-NEXT:    xorl %edx, %edx -; AVX512-NEXT:    shldq %cl, %rax, %rdx -; AVX512-NEXT:    xorl %esi, %esi -; AVX512-NEXT:    shlxq %rcx, %rax, %rax -; AVX512-NEXT:    testb $64, %cl -; AVX512-NEXT:    cmovneq %rax, %rdx -; AVX512-NEXT:    cmovneq %rsi, %rax -; AVX512-NEXT:    andq 8(%rdi), %rdx -; AVX512-NEXT:    andq (%rdi), %rax -; AVX512-NEXT:    orq %rdx, %rax -; AVX512-NEXT:    setne %al -; AVX512-NEXT:    retq +; X64-LABEL: test_ne_i128: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    andl $96, %eax +; X64-NEXT:    shrl $3, %eax +; X64-NEXT:    movl (%rdi,%rax), %eax +; X64-NEXT:    btl %esi, %eax +; X64-NEXT:    setb %al +; X64-NEXT:    retq    %rem = and i32 %position, 127    %ofs = zext nneg i32 %rem to i128    %bit = shl nuw i128 1, %ofs @@ -623,124 +476,33 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {  define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {  ; X86-LABEL: complement_ne_i128:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $80, %esp -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrb $3, %al -; X86-NEXT:    andb $12, %al -; X86-NEXT:    negb %al -; X86-NEXT:    movsbl %al, %eax -; X86-NEXT:    movl 56(%esp,%eax), %esi -; X86-NEXT:    movl 60(%esp,%eax), %edx -; X86-NEXT:    shldl %cl, %esi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%esp,%eax), %edi -; X86-NEXT:    movl 52(%esp,%eax), %ebx -; X86-NEXT:    shldl %cl, %ebx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %ebx -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    shll %cl, %edi -; X86-NEXT:    movl %eax, %ecx -; X86-NEXT:    movl 8(%eax), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %eax -; X86-NEXT:    movl (%ecx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %ecx, %esi -; X86-NEXT:    movl %edx, %ecx -; X86-NEXT:    andl %edi, %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl 12(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 4(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    movl %edx, 8(%eax) -; X86-NEXT:    movl %esi, 12(%eax) -; X86-NEXT:    movl %edi, (%eax) -; X86-NEXT:    movl %ebx, 4(%eax) -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $96, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btcl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: complement_ne_i128: -; SSE:       # %bb.0: -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl $1, %edx -; SSE-NEXT:    xorl %esi, %esi -; SSE-NEXT:    shldq %cl, %rdx, %rsi -; SSE-NEXT:    shlq %cl, %rdx -; SSE-NEXT:    xorl %eax, %eax -; SSE-NEXT:    testb $64, %cl -; SSE-NEXT:    cmovneq %rdx, %rsi -; SSE-NEXT:    cmovneq %rax, %rdx -; SSE-NEXT:    movq (%rdi), %rax -; SSE-NEXT:    movq 8(%rdi), %rcx -; SSE-NEXT:    movq %rcx, %r8 -; SSE-NEXT:    andq %rsi, %r8 -; SSE-NEXT:    movq %rax, %r9 -; SSE-NEXT:    andq %rdx, %r9 -; SSE-NEXT:    xorq %rcx, %rsi -; SSE-NEXT:    xorq %rax, %rdx -; SSE-NEXT:    orq %r8, %r9 -; SSE-NEXT:    setne %al -; SSE-NEXT:    movq %rdx, (%rdi) -; SSE-NEXT:    movq %rsi, 8(%rdi) -; SSE-NEXT:    retq -; -; AVX-LABEL: complement_ne_i128: -; AVX:       # %bb.0: -; AVX-NEXT:    movl %esi, %ecx -; AVX-NEXT:    xorl %eax, %eax -; AVX-NEXT:    movl $1, %edx -; AVX-NEXT:    xorl %esi, %esi -; AVX-NEXT:    shldq %cl, %rdx, %rsi -; AVX-NEXT:    shlxq %rcx, %rdx, %rdx -; AVX-NEXT:    testb $64, %cl -; AVX-NEXT:    cmovneq %rdx, %rsi -; AVX-NEXT:    cmovneq %rax, %rdx -; AVX-NEXT:    movq (%rdi), %rax -; AVX-NEXT:    movq 8(%rdi), %rcx -; AVX-NEXT:    movq %rcx, %r8 -; AVX-NEXT:    andq %rsi, %r8 -; AVX-NEXT:    movq %rax, %r9 -; AVX-NEXT:    andq %rdx, %r9 -; AVX-NEXT:    xorq %rcx, %rsi -; AVX-NEXT:    xorq %rax, %rdx -; AVX-NEXT:    orq %r8, %r9 -; AVX-NEXT:    setne %al -; AVX-NEXT:    movq %rdx, (%rdi) -; AVX-NEXT:    movq %rsi, 8(%rdi) -; AVX-NEXT:    retq +; X64-LABEL: complement_ne_i128: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    andl $96, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setb %al +; X64-NEXT:    btcl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 127    %ofs = zext nneg i32 %rem to i128    %bit = shl nuw i128 1, %ofs @@ -755,124 +517,33 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {  define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {  ; X86-LABEL: reset_eq_i128:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $80, %esp -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrb $3, %al -; X86-NEXT:    andb $12, %al -; X86-NEXT:    negb %al -; X86-NEXT:    movsbl %al, %eax -; X86-NEXT:    movl 56(%esp,%eax), %edx -; X86-NEXT:    movl 60(%esp,%eax), %esi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%esp,%eax), %esi -; X86-NEXT:    movl 52(%esp,%eax), %edi -; X86-NEXT:    shldl %cl, %edi, %edx -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    shll %cl, %esi -; X86-NEXT:    movl 8(%ebx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    movl (%ebx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%ebx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl %edi, %ecx -; X86-NEXT:    movl 4(%ebx), %ebx -; X86-NEXT:    andl %ebx, %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    notl %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    notl %ecx -; X86-NEXT:    andl %ebx, %ecx -; X86-NEXT:    notl %esi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl 8(%ebp), %edi -; X86-NEXT:    movl %edx, 8(%edi) -; X86-NEXT:    movl %eax, 12(%edi) -; X86-NEXT:    movl %esi, (%edi) -; X86-NEXT:    movl %ecx, 4(%edi) -; X86-NEXT:    sete %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $96, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setae %al +; X86-NEXT:    btrl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: reset_eq_i128: -; SSE:       # %bb.0: -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl $1, %edx -; SSE-NEXT:    xorl %esi, %esi -; SSE-NEXT:    shldq %cl, %rdx, %rsi -; SSE-NEXT:    xorl %eax, %eax -; SSE-NEXT:    shlq %cl, %rdx -; SSE-NEXT:    testb $64, %cl -; SSE-NEXT:    cmovneq %rdx, %rsi -; SSE-NEXT:    cmovneq %rax, %rdx -; SSE-NEXT:    movq (%rdi), %rax -; SSE-NEXT:    movq 8(%rdi), %rcx -; SSE-NEXT:    movq %rcx, %r8 -; SSE-NEXT:    andq %rsi, %r8 -; SSE-NEXT:    notq %rsi -; SSE-NEXT:    movq %rax, %r9 -; SSE-NEXT:    andq %rdx, %r9 -; SSE-NEXT:    notq %rdx -; SSE-NEXT:    andq %rcx, %rsi -; SSE-NEXT:    andq %rax, %rdx -; SSE-NEXT:    orq %r8, %r9 -; SSE-NEXT:    sete %al -; SSE-NEXT:    movq %rdx, (%rdi) -; SSE-NEXT:    movq %rsi, 8(%rdi) -; SSE-NEXT:    retq -; -; AVX-LABEL: reset_eq_i128: -; AVX:       # %bb.0: -; AVX-NEXT:    movl %esi, %ecx -; AVX-NEXT:    xorl %eax, %eax -; AVX-NEXT:    movl $1, %edx -; AVX-NEXT:    xorl %esi, %esi -; AVX-NEXT:    shldq %cl, %rdx, %rsi -; AVX-NEXT:    shlxq %rcx, %rdx, %rdx -; AVX-NEXT:    testb $64, %cl -; AVX-NEXT:    cmovneq %rdx, %rsi -; AVX-NEXT:    cmovneq %rax, %rdx -; AVX-NEXT:    movq (%rdi), %rax -; AVX-NEXT:    movq 8(%rdi), %rcx -; AVX-NEXT:    andnq %rcx, %rsi, %r8 -; AVX-NEXT:    andq %rsi, %rcx -; AVX-NEXT:    andnq %rax, %rdx, %rsi -; AVX-NEXT:    andq %rdx, %rax -; AVX-NEXT:    orq %rcx, %rax -; AVX-NEXT:    sete %al -; AVX-NEXT:    movq %rsi, (%rdi) -; AVX-NEXT:    movq %r8, 8(%rdi) -; AVX-NEXT:    retq +; X64-LABEL: reset_eq_i128: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    andl $96, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setae %al +; X64-NEXT:    btrl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 127    %ofs = zext nneg i32 %rem to i128    %bit = shl nuw i128 1, %ofs @@ -888,124 +559,33 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {  define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {  ; X86-LABEL: set_ne_i128:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $80, %esp -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrb $3, %al -; X86-NEXT:    andb $12, %al -; X86-NEXT:    negb %al -; X86-NEXT:    movsbl %al, %eax -; X86-NEXT:    movl 56(%esp,%eax), %esi -; X86-NEXT:    movl 60(%esp,%eax), %edx -; X86-NEXT:    shldl %cl, %esi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%esp,%eax), %edi -; X86-NEXT:    movl 52(%esp,%eax), %ebx -; X86-NEXT:    shldl %cl, %ebx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %ebx -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    shll %cl, %edi -; X86-NEXT:    movl %eax, %ecx -; X86-NEXT:    movl 8(%eax), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %eax -; X86-NEXT:    movl (%ecx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %ecx, %esi -; X86-NEXT:    movl %edx, %ecx -; X86-NEXT:    andl %edi, %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl 12(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 4(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    movl %edx, 8(%eax) -; X86-NEXT:    movl %esi, 12(%eax) -; X86-NEXT:    movl %edi, (%eax) -; X86-NEXT:    movl %ebx, 4(%eax) -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $96, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btsl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: set_ne_i128: -; SSE:       # %bb.0: -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl $1, %edx -; SSE-NEXT:    xorl %esi, %esi -; SSE-NEXT:    shldq %cl, %rdx, %rsi -; SSE-NEXT:    shlq %cl, %rdx -; SSE-NEXT:    xorl %eax, %eax -; SSE-NEXT:    testb $64, %cl -; SSE-NEXT:    cmovneq %rdx, %rsi -; SSE-NEXT:    cmovneq %rax, %rdx -; SSE-NEXT:    movq (%rdi), %rax -; SSE-NEXT:    movq 8(%rdi), %rcx -; SSE-NEXT:    movq %rcx, %r8 -; SSE-NEXT:    andq %rsi, %r8 -; SSE-NEXT:    movq %rax, %r9 -; SSE-NEXT:    andq %rdx, %r9 -; SSE-NEXT:    orq %rcx, %rsi -; SSE-NEXT:    orq %rax, %rdx -; SSE-NEXT:    orq %r8, %r9 -; SSE-NEXT:    setne %al -; SSE-NEXT:    movq %rdx, (%rdi) -; SSE-NEXT:    movq %rsi, 8(%rdi) -; SSE-NEXT:    retq -; -; AVX-LABEL: set_ne_i128: -; AVX:       # %bb.0: -; AVX-NEXT:    movl %esi, %ecx -; AVX-NEXT:    xorl %eax, %eax -; AVX-NEXT:    movl $1, %edx -; AVX-NEXT:    xorl %esi, %esi -; AVX-NEXT:    shldq %cl, %rdx, %rsi -; AVX-NEXT:    shlxq %rcx, %rdx, %rdx -; AVX-NEXT:    testb $64, %cl -; AVX-NEXT:    cmovneq %rdx, %rsi -; AVX-NEXT:    cmovneq %rax, %rdx -; AVX-NEXT:    movq (%rdi), %rax -; AVX-NEXT:    movq 8(%rdi), %rcx -; AVX-NEXT:    movq %rcx, %r8 -; AVX-NEXT:    andq %rsi, %r8 -; AVX-NEXT:    movq %rax, %r9 -; AVX-NEXT:    andq %rdx, %r9 -; AVX-NEXT:    orq %rcx, %rsi -; AVX-NEXT:    orq %rax, %rdx -; AVX-NEXT:    orq %r8, %r9 -; AVX-NEXT:    setne %al -; AVX-NEXT:    movq %rdx, (%rdi) -; AVX-NEXT:    movq %rsi, 8(%rdi) -; AVX-NEXT:    retq +; X64-LABEL: set_ne_i128: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    andl $96, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setb %al +; X64-NEXT:    btsl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 127    %ofs = zext nneg i32 %rem to i128    %bit = shl nuw i128 1, %ofs @@ -1026,9 +606,9 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi  ; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $128, %esp -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    movzbl 16(%ebp), %eax +; X86-NEXT:    subl $96, %esp +; X86-NEXT:    movl 12(%ebp), %ecx +; X86-NEXT:    movzbl 16(%ebp), %ebx  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) @@ -1037,25 +617,29 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ecx, %edx -; X86-NEXT:    shrb $3, %dl -; X86-NEXT:    andb $12, %dl -; X86-NEXT:    negb %dl -; X86-NEXT:    movsbl %dl, %esi -; X86-NEXT:    movl 64(%esp,%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 68(%esp,%esi), %edx +; X86-NEXT:    movl %ecx, %eax +; X86-NEXT:    shrb $3, %al +; X86-NEXT:    andb $12, %al +; X86-NEXT:    negb %al +; X86-NEXT:    movsbl %al, %edi +; X86-NEXT:    movl 72(%esp,%edi), %edx +; X86-NEXT:    movl 76(%esp,%edi), %esi +; X86-NEXT:    movzbl %bl, %eax +; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 64(%esp,%edi), %ebx +; X86-NEXT:    movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT:    movl 68(%esp,%edi), %ebx +; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT:    shldl %cl, %edx, %esi +; X86-NEXT:    shldl %cl, %ebx, %edx  ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 72(%esp,%esi), %ebx +; X86-NEXT:    movl (%esp), %eax # 4-byte Reload +; X86-NEXT:    shldl %cl, %eax, %ebx +; X86-NEXT:    # kill: def $cl killed $cl killed $ecx +; X86-NEXT:    shll %cl, %eax +; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill +; X86-NEXT:    notl %esi  ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movzbl %al, %eax -; X86-NEXT:    movl 76(%esp,%esi), %edi -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ebx, %eax -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    shldl %cl, %ebx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shll %cl, %edx  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) @@ -1063,72 +647,53 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%esi), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %eax -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%esi), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %edi, %esi +; X86-NEXT:    movl 40(%esp,%eax), %edi +; X86-NEXT:    movl 44(%esp,%eax), %esi +; X86-NEXT:    movl 12(%ebp), %ecx +; X86-NEXT:    # kill: def $cl killed $cl killed $ecx +; X86-NEXT:    shldl %cl, %edi, %esi  ; X86-NEXT:    movl 8(%ebp), %ecx -; X86-NEXT:    movl 12(%ecx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %edi -; X86-NEXT:    movl %eax, %ebx -; X86-NEXT:    movl %eax, %edx -; X86-NEXT:    movl 4(%ecx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %ebx -; X86-NEXT:    orl %edi, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    notl %ecx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl 100(%esp,%ecx), %edi -; X86-NEXT:    movl 104(%esp,%ecx), %ecx -; X86-NEXT:    movl %ecx, %ebx -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    shldl %cl, %edi, %ebx -; X86-NEXT:    orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    notl %esi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl 108(%esp,%ebx), %ebx -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %esi +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT:    andl 12(%ecx), %eax +; X86-NEXT:    orl %esi, %eax +; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload  ; X86-NEXT:    notl %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl 96(%esp,%ebx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shll %cl, %ebx -; X86-NEXT:    orl %ebx, %eax +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT:    movl 36(%esp,%esi), %esi +; X86-NEXT:    movl 12(%ebp), %ecx +; X86-NEXT:    shldl %cl, %esi, %edi +; X86-NEXT:    movl 8(%ebp), %edx +; X86-NEXT:    andl 8(%edx), %eax +; X86-NEXT:    orl %edi, %eax +; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT:    movl 32(%esp,%eax), %eax +; X86-NEXT:    shldl %cl, %eax, %esi +; X86-NEXT:    movl 8(%ebp), %edi +; X86-NEXT:    andl 4(%edi), %ebx +; X86-NEXT:    orl %esi, %ebx +; X86-NEXT:    movl (%esp), %edx # 4-byte Reload  ; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    shldl %cl, %ebx, %edi -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl 8(%ebp), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 8(%ecx) -; X86-NEXT:    movl %esi, 12(%ecx) -; X86-NEXT:    movl %eax, (%ecx) -; X86-NEXT:    movl %edx, 4(%ecx) -; X86-NEXT:    sete %al +; X86-NEXT:    # kill: def $cl killed $cl killed $ecx +; X86-NEXT:    shll %cl, %eax +; X86-NEXT:    andl (%edi), %edx +; X86-NEXT:    orl %eax, %edx +; X86-NEXT:    movl 12(%ebp), %ecx +; X86-NEXT:    movl %ecx, %eax +; X86-NEXT:    andl $96, %eax +; X86-NEXT:    shrl $3, %eax +; X86-NEXT:    movl (%edi,%eax), %eax +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT:    movl %eax, 12(%edi) +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT:    movl %eax, 8(%edi) +; X86-NEXT:    movl %ebx, 4(%edi) +; X86-NEXT:    movl %edx, (%edi) +; X86-NEXT:    setae %al  ; X86-NEXT:    leal -12(%ebp), %esp  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi @@ -1151,86 +716,84 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; SSE-NEXT:    testb $64, %cl  ; SSE-NEXT:    cmovneq %rsi, %r8  ; SSE-NEXT:    cmovneq %r9, %rsi +; SSE-NEXT:    notq %r8  ; SSE-NEXT:    cmovneq %rax, %rdx  ; SSE-NEXT:    cmovneq %r9, %rax -; SSE-NEXT:    movq (%rdi), %rcx -; SSE-NEXT:    movq 8(%rdi), %r9 -; SSE-NEXT:    movq %r9, %r10 -; SSE-NEXT:    andq %r8, %r10 -; SSE-NEXT:    notq %r8 -; SSE-NEXT:    movq %rcx, %r11 -; SSE-NEXT:    andq %rsi, %r11  ; SSE-NEXT:    notq %rsi -; SSE-NEXT:    andq %r9, %r8 +; SSE-NEXT:    andq 8(%rdi), %r8  ; SSE-NEXT:    orq %rdx, %r8 -; SSE-NEXT:    andq %rcx, %rsi +; SSE-NEXT:    andq (%rdi), %rsi  ; SSE-NEXT:    orq %rax, %rsi -; SSE-NEXT:    orq %r10, %r11 -; SSE-NEXT:    sete %al -; SSE-NEXT:    movq %rsi, (%rdi) +; SSE-NEXT:    movl %ecx, %eax +; SSE-NEXT:    andl $96, %eax +; SSE-NEXT:    shrl $3, %eax +; SSE-NEXT:    movl (%rdi,%rax), %eax +; SSE-NEXT:    btl %ecx, %eax +; SSE-NEXT:    setae %al  ; SSE-NEXT:    movq %r8, 8(%rdi) +; SSE-NEXT:    movq %rsi, (%rdi)  ; SSE-NEXT:    retq  ;  ; AVX2-LABEL: init_eq_i128:  ; AVX2:       # %bb.0:  ; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    movl $1, %esi -; AVX2-NEXT:    xorl %eax, %eax -; AVX2-NEXT:    shldq %cl, %rsi, %rax -; AVX2-NEXT:    xorl %r8d, %r8d +; AVX2-NEXT:    movl $1, %eax +; AVX2-NEXT:    xorl %esi, %esi +; AVX2-NEXT:    shldq %cl, %rax, %rsi  ; AVX2-NEXT:    movl %edx, %edx +; AVX2-NEXT:    xorl %r8d, %r8d +; AVX2-NEXT:    shldq %cl, %rdx, %r8  ; AVX2-NEXT:    xorl %r9d, %r9d -; AVX2-NEXT:    shldq %cl, %rdx, %r9 -; AVX2-NEXT:    shlxq %rcx, %rsi, %rsi +; AVX2-NEXT:    shlxq %rcx, %rax, %rax  ; AVX2-NEXT:    testb $64, %cl -; AVX2-NEXT:    cmovneq %rsi, %rax -; AVX2-NEXT:    cmovneq %r8, %rsi -; AVX2-NEXT:    shlxq %rcx, %rdx, %rcx -; AVX2-NEXT:    cmovneq %rcx, %r9 -; AVX2-NEXT:    cmovneq %r8, %rcx -; AVX2-NEXT:    movq (%rdi), %rdx -; AVX2-NEXT:    movq 8(%rdi), %r8 -; AVX2-NEXT:    andnq %r8, %rax, %r10 -; AVX2-NEXT:    andq %rax, %r8 -; AVX2-NEXT:    andnq %rdx, %rsi, %r11 -; AVX2-NEXT:    andq %rsi, %rdx -; AVX2-NEXT:    orq %r9, %r10 -; AVX2-NEXT:    orq %rcx, %r11 -; AVX2-NEXT:    orq %r8, %rdx -; AVX2-NEXT:    sete %al -; AVX2-NEXT:    movq %r11, (%rdi) -; AVX2-NEXT:    movq %r10, 8(%rdi) +; AVX2-NEXT:    cmovneq %rax, %rsi +; AVX2-NEXT:    cmovneq %r9, %rax +; AVX2-NEXT:    shlxq %rcx, %rdx, %rdx +; AVX2-NEXT:    cmovneq %rdx, %r8 +; AVX2-NEXT:    cmovneq %r9, %rdx +; AVX2-NEXT:    andnq 8(%rdi), %rsi, %rsi +; AVX2-NEXT:    orq %r8, %rsi +; AVX2-NEXT:    andnq (%rdi), %rax, %r8 +; AVX2-NEXT:    orq %rdx, %r8 +; AVX2-NEXT:    movl %ecx, %eax +; AVX2-NEXT:    andl $96, %eax +; AVX2-NEXT:    shrl $3, %eax +; AVX2-NEXT:    movl (%rdi,%rax), %eax +; AVX2-NEXT:    btl %ecx, %eax +; AVX2-NEXT:    setae %al +; AVX2-NEXT:    movq %rsi, 8(%rdi) +; AVX2-NEXT:    movq %r8, (%rdi)  ; AVX2-NEXT:    retq  ;  ; AVX512-LABEL: init_eq_i128:  ; AVX512:       # %bb.0:  ; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    xorl %eax, %eax -; AVX512-NEXT:    movl $1, %esi +; AVX512-NEXT:    movl $1, %eax +; AVX512-NEXT:    xorl %esi, %esi +; AVX512-NEXT:    shldq %cl, %rax, %rsi  ; AVX512-NEXT:    xorl %r8d, %r8d -; AVX512-NEXT:    shldq %cl, %rsi, %r8 -; AVX512-NEXT:    shlxq %rcx, %rsi, %rsi +; AVX512-NEXT:    shlxq %rcx, %rax, %rax  ; AVX512-NEXT:    movl %edx, %edx  ; AVX512-NEXT:    xorl %r9d, %r9d  ; AVX512-NEXT:    shldq %cl, %rdx, %r9  ; AVX512-NEXT:    testb $64, %cl -; AVX512-NEXT:    cmovneq %rsi, %r8  ; AVX512-NEXT:    cmovneq %rax, %rsi -; AVX512-NEXT:    shlxq %rcx, %rdx, %rcx -; AVX512-NEXT:    cmovneq %rcx, %r9 -; AVX512-NEXT:    cmovneq %rax, %rcx -; AVX512-NEXT:    movq (%rdi), %rax -; AVX512-NEXT:    movq 8(%rdi), %rdx -; AVX512-NEXT:    andnq %rdx, %r8, %r10 -; AVX512-NEXT:    andq %r8, %rdx -; AVX512-NEXT:    andnq %rax, %rsi, %r8 -; AVX512-NEXT:    andq %rsi, %rax -; AVX512-NEXT:    orq %r9, %r10 -; AVX512-NEXT:    orq %rcx, %r8 -; AVX512-NEXT:    orq %rdx, %rax -; AVX512-NEXT:    sete %al +; AVX512-NEXT:    cmovneq %r8, %rax +; AVX512-NEXT:    shlxq %rcx, %rdx, %rdx +; AVX512-NEXT:    cmovneq %rdx, %r9 +; AVX512-NEXT:    cmovneq %r8, %rdx +; AVX512-NEXT:    andnq 8(%rdi), %rsi, %rsi +; AVX512-NEXT:    orq %r9, %rsi +; AVX512-NEXT:    andnq (%rdi), %rax, %r8 +; AVX512-NEXT:    orq %rdx, %r8 +; AVX512-NEXT:    movl %ecx, %eax +; AVX512-NEXT:    andl $96, %eax +; AVX512-NEXT:    shrl $3, %eax +; AVX512-NEXT:    movl (%rdi,%rax), %eax +; AVX512-NEXT:    btl %ecx, %eax +; AVX512-NEXT:    setae %al +; AVX512-NEXT:    movq %rsi, 8(%rdi)  ; AVX512-NEXT:    movq %r8, (%rdi) -; AVX512-NEXT:    movq %r10, 8(%rdi)  ; AVX512-NEXT:    retq    %rem = and i32 %position, 127    %ofs = zext nneg i32 %rem to i128 @@ -1252,344 +815,25 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {  define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {  ; X86-LABEL: test_ne_i512:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx -; X86-NEXT:    pushl %edi -; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $224, %esp -; X86-NEXT:    movl 12(%ebp), %ecx -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrl $3, %eax -; X86-NEXT:    andl $60, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx -; X86-NEXT:    subl %eax, %edx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 24(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl $31, %ecx -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%edx), %eax -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%edx), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%edx), %edi -; X86-NEXT:    movl %edi, %ebx -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%edx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %esi, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 52(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 4(%edx), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    shldl %cl, %edi, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    andl 40(%ebx), %eax -; X86-NEXT:    andl 8(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 56(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 24(%ebx), %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %ebx, %edi -; X86-NEXT:    andl 44(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 12(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    movl %esi, %ebx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 60(%edi), %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 28(%edi), %eax -; X86-NEXT:    orl %esi, %eax -; X86-NEXT:    orl %ebx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%edx), %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    negl %edx -; X86-NEXT:    movl 192(%esp,%edx), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    shldl %cl, %ebx, %edx -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    andl 32(%ebx), %ecx -; X86-NEXT:    andl (%ebx), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    andl 16(%ebx), %edi -; X86-NEXT:    andl 48(%ebx), %edx -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 36(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 4(%ebx), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 20(%ebx), %ecx -; X86-NEXT:    andl 52(%ebx), %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl %esi, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp -; X86-NEXT:    popl %esi -; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl %ecx, %edx +; X86-NEXT:    shrl $3, %edx +; X86-NEXT:    andl $60, %edx +; X86-NEXT:    movl (%eax,%edx), %eax +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    setb %al  ; X86-NEXT:    retl  ; -; SSE-LABEL: test_ne_i512: -; SSE:       # %bb.0: -; SSE-NEXT:    pushq %r15 -; SSE-NEXT:    pushq %r14 -; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    andl $63, %ecx -; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    andl $56, %esi -; SSE-NEXT:    negl %esi -; SSE-NEXT:    movslq %esi, %rbx -; SSE-NEXT:    movq -48(%rsp,%rbx), %rdx -; SSE-NEXT:    movq -40(%rsp,%rbx), %r14 -; SSE-NEXT:    movq %r14, %rax -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq -16(%rsp,%rbx), %r11 -; SSE-NEXT:    movq -8(%rsp,%rbx), %r10 -; SSE-NEXT:    shldq %cl, %r11, %r10 -; SSE-NEXT:    movq -32(%rsp,%rbx), %r9 -; SSE-NEXT:    movq -24(%rsp,%rbx), %r15 -; SSE-NEXT:    movq %r15, %r8 -; SSE-NEXT:    shldq %cl, %r9, %r8 -; SSE-NEXT:    movq -56(%rsp,%rbx), %rsi -; SSE-NEXT:    shldq %cl, %rsi, %rdx -; SSE-NEXT:    shldq %cl, %r15, %r11 -; SSE-NEXT:    shldq %cl, %r14, %r9 -; SSE-NEXT:    movq -64(%rsp,%rbx), %rbx -; SSE-NEXT:    shldq %cl, %rbx, %rsi -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx -; SSE-NEXT:    shlq %cl, %rbx -; SSE-NEXT:    andq 32(%rdi), %r9 -; SSE-NEXT:    andq 48(%rdi), %r11 -; SSE-NEXT:    andq 16(%rdi), %rdx -; SSE-NEXT:    orq %r11, %rdx -; SSE-NEXT:    andq 40(%rdi), %r8 -; SSE-NEXT:    andq 56(%rdi), %r10 -; SSE-NEXT:    andq 24(%rdi), %rax -; SSE-NEXT:    orq %r10, %rax -; SSE-NEXT:    andq (%rdi), %rbx -; SSE-NEXT:    orq %r9, %rbx -; SSE-NEXT:    orq %rdx, %rbx -; SSE-NEXT:    andq 8(%rdi), %rsi -; SSE-NEXT:    orq %r8, %rsi -; SSE-NEXT:    orq %rax, %rsi -; SSE-NEXT:    orq %rbx, %rsi -; SSE-NEXT:    setne %al -; SSE-NEXT:    popq %rbx -; SSE-NEXT:    popq %r14 -; SSE-NEXT:    popq %r15 -; SSE-NEXT:    retq -; -; AVX2-LABEL: test_ne_i512: -; AVX2:       # %bb.0: -; AVX2-NEXT:    pushq %r15 -; AVX2-NEXT:    pushq %r14 -; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    andl $63, %ecx -; AVX2-NEXT:    shrl $3, %esi -; AVX2-NEXT:    andl $56, %esi -; AVX2-NEXT:    negl %esi -; AVX2-NEXT:    movslq %esi, %rsi -; AVX2-NEXT:    movq -48(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq -40(%rsp,%rsi), %rbx -; AVX2-NEXT:    movq %rbx, %rax -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq -16(%rsp,%rsi), %r11 -; AVX2-NEXT:    movq -8(%rsp,%rsi), %r10 -; AVX2-NEXT:    shldq %cl, %r11, %r10 -; AVX2-NEXT:    movq -32(%rsp,%rsi), %r9 -; AVX2-NEXT:    movq -24(%rsp,%rsi), %r14 -; AVX2-NEXT:    movq %r14, %r8 -; AVX2-NEXT:    shldq %cl, %r9, %r8 -; AVX2-NEXT:    movq -64(%rsp,%rsi), %r15 -; AVX2-NEXT:    movq -56(%rsp,%rsi), %rsi -; AVX2-NEXT:    shldq %cl, %rsi, %rdx -; AVX2-NEXT:    shldq %cl, %r14, %r11 -; AVX2-NEXT:    shldq %cl, %rbx, %r9 -; AVX2-NEXT:    shldq %cl, %r15, %rsi -; AVX2-NEXT:    shlxq %rcx, %r15, %rcx -; AVX2-NEXT:    andq 32(%rdi), %r9 -; AVX2-NEXT:    andq 48(%rdi), %r11 -; AVX2-NEXT:    andq 16(%rdi), %rdx -; AVX2-NEXT:    andq 40(%rdi), %r8 -; AVX2-NEXT:    andq 56(%rdi), %r10 -; AVX2-NEXT:    andq 24(%rdi), %rax -; AVX2-NEXT:    orq %r11, %rdx -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    andq (%rdi), %rcx -; AVX2-NEXT:    orq %r9, %rcx -; AVX2-NEXT:    orq %rdx, %rcx -; AVX2-NEXT:    andq 8(%rdi), %rsi -; AVX2-NEXT:    orq %r8, %rsi -; AVX2-NEXT:    orq %rax, %rsi -; AVX2-NEXT:    orq %rcx, %rsi -; AVX2-NEXT:    setne %al -; AVX2-NEXT:    popq %rbx -; AVX2-NEXT:    popq %r14 -; AVX2-NEXT:    popq %r15 -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq -; -; AVX512-LABEL: test_ne_i512: -; AVX512:       # %bb.0: -; AVX512-NEXT:    pushq %r15 -; AVX512-NEXT:    pushq %r14 -; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %esi -; AVX512-NEXT:    andl $56, %esi -; AVX512-NEXT:    negl %esi -; AVX512-NEXT:    movslq %esi, %rbx -; AVX512-NEXT:    movq -48(%rsp,%rbx), %rdx -; AVX512-NEXT:    movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT:    movq %r14, %rax -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq -16(%rsp,%rbx), %r11 -; AVX512-NEXT:    movq -8(%rsp,%rbx), %r10 -; AVX512-NEXT:    shldq %cl, %r11, %r10 -; AVX512-NEXT:    movq -32(%rsp,%rbx), %r9 -; AVX512-NEXT:    movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT:    movq %r15, %r8 -; AVX512-NEXT:    shldq %cl, %r9, %r8 -; AVX512-NEXT:    movq -56(%rsp,%rbx), %rsi -; AVX512-NEXT:    shldq %cl, %rsi, %rdx -; AVX512-NEXT:    shldq %cl, %r15, %r11 -; AVX512-NEXT:    shldq %cl, %r14, %r9 -; AVX512-NEXT:    movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT:    shldq %cl, %rbx, %rsi -; AVX512-NEXT:    shlxq %rcx, %rbx, %rcx -; AVX512-NEXT:    andq 32(%rdi), %r9 -; AVX512-NEXT:    andq 48(%rdi), %r11 -; AVX512-NEXT:    andq 16(%rdi), %rdx -; AVX512-NEXT:    andq 40(%rdi), %r8 -; AVX512-NEXT:    andq 56(%rdi), %r10 -; AVX512-NEXT:    andq 24(%rdi), %rax -; AVX512-NEXT:    orq %r11, %rdx -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    andq (%rdi), %rcx -; AVX512-NEXT:    orq %r9, %rcx -; AVX512-NEXT:    orq %rdx, %rcx -; AVX512-NEXT:    andq 8(%rdi), %rsi -; AVX512-NEXT:    orq %r8, %rsi -; AVX512-NEXT:    orq %rax, %rsi -; AVX512-NEXT:    orq %rcx, %rsi -; AVX512-NEXT:    setne %al -; AVX512-NEXT:    popq %rbx -; AVX512-NEXT:    popq %r14 -; AVX512-NEXT:    popq %r15 -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; X64-LABEL: test_ne_i512: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    shrl $3, %eax +; X64-NEXT:    andl $60, %eax +; X64-NEXT:    movl (%rdi,%rax), %eax +; X64-NEXT:    btl %esi, %eax +; X64-NEXT:    setb %al +; X64-NEXT:    retq    %rem = and i32 %position, 511    %ofs = zext nneg i32 %rem to i512    %bit = shl nuw i512 1, %ofs @@ -1602,572 +846,33 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {  define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {  ; X86-LABEL: complement_ne_i512:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $272, %esp # imm = 0x110 -; X86-NEXT:    movl 12(%ebp), %ecx -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrl $3, %eax -; X86-NEXT:    andl $60, %eax -; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill -; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx -; X86-NEXT:    subl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 24(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl $31, %ecx -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%edx), %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%edx), %ebx -; X86-NEXT:    movl %ebx, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%edx), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 52(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 4(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %edx -; X86-NEXT:    movl 40(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %eax -; X86-NEXT:    movl 8(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    movl %edx, %eax -; X86-NEXT:    movl 56(%edx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edi, %ebx -; X86-NEXT:    movl 24(%edx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%eax), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl 12(%eax), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    orl %esi, %ebx -; X86-NEXT:    movl 60(%eax), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    movl %edx, %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 28(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl (%eax), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%esp), %eax # 4-byte Reload -; X86-NEXT:    negl %eax -; X86-NEXT:    movl 240(%esp,%eax), %esi -; X86-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    movl 32(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edi, %eax -; X86-NEXT:    movl (%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl 16(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %eax -; X86-NEXT:    movl 48(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 36(%esi), %ebx -; X86-NEXT:    movl %ebx, %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 4(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl %esi, %eax -; X86-NEXT:    movl 20(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl %esi, %edi -; X86-NEXT:    movl 52(%eax), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    orl %edi, %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl 8(%ebp), %edx -; X86-NEXT:    movl %ebx, 60(%edx) -; X86-NEXT:    movl %edi, 56(%edx) -; X86-NEXT:    movl %ecx, 52(%edx) -; X86-NEXT:    movl %esi, 44(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 40(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 36(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 32(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 28(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 24(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 20(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 16(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 12(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 8(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 4(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, (%edx) -; X86-NEXT:    movl (%esp), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 48(%edx) -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    andl $60, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btcl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: complement_ne_i512: -; SSE:       # %bb.0: -; SSE-NEXT:    pushq %rbp -; SSE-NEXT:    pushq %r15 -; SSE-NEXT:    pushq %r14 -; SSE-NEXT:    pushq %r13 -; SSE-NEXT:    pushq %r12 -; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    subq $56, %rsp -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    andl $63, %ecx -; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    andl $56, %esi -; SSE-NEXT:    negl %esi -; SSE-NEXT:    movslq %esi, %rbx -; SSE-NEXT:    movq (%rsp,%rbx), %rsi -; SSE-NEXT:    movq 8(%rsp,%rbx), %r14 -; SSE-NEXT:    movq %r14, %rax -; SSE-NEXT:    shldq %cl, %rsi, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 32(%rsp,%rbx), %r8 -; SSE-NEXT:    movq 40(%rsp,%rbx), %rbp -; SSE-NEXT:    shldq %cl, %r8, %rbp -; SSE-NEXT:    movq 16(%rsp,%rbx), %r9 -; SSE-NEXT:    movq 24(%rsp,%rbx), %r15 -; SSE-NEXT:    movq %r15, %r10 -; SSE-NEXT:    shldq %cl, %r9, %r10 -; SSE-NEXT:    movq -8(%rsp,%rbx), %r11 -; SSE-NEXT:    shldq %cl, %r11, %rsi -; SSE-NEXT:    shldq %cl, %r15, %r8 -; SSE-NEXT:    shldq %cl, %r14, %r9 -; SSE-NEXT:    movq -16(%rsp,%rbx), %rbx -; SSE-NEXT:    shldq %cl, %rbx, %r11 -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx -; SSE-NEXT:    shlq %cl, %rbx -; SSE-NEXT:    movq 24(%rdi), %r15 -; SSE-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 56(%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 16(%rdi), %r12 -; SSE-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 48(%rdi), %r13 -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %r8, %r13 -; SSE-NEXT:    andq %rsi, %r12 -; SSE-NEXT:    orq %r13, %r12 -; SSE-NEXT:    movq %rcx, %r13 -; SSE-NEXT:    andq %rbp, %r13 -; SSE-NEXT:    andq %rax, %r15 -; SSE-NEXT:    orq %r13, %r15 -; SSE-NEXT:    movq 32(%rdi), %r14 -; SSE-NEXT:    movq %r14, %rcx -; SSE-NEXT:    andq %r9, %rcx -; SSE-NEXT:    movq (%rdi), %r13 -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rbx, %r13 -; SSE-NEXT:    orq %rcx, %r13 -; SSE-NEXT:    orq %r12, %r13 -; SSE-NEXT:    movq 40(%rdi), %rcx -; SSE-NEXT:    movq %rcx, %r12 -; SSE-NEXT:    andq %r10, %r12 -; SSE-NEXT:    movq 8(%rdi), %rdx -; SSE-NEXT:    movq %rdx, %rax -; SSE-NEXT:    andq %r11, %rax -; SSE-NEXT:    orq %r12, %rax -; SSE-NEXT:    orq %r15, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT:    xorq %rcx, %r10 -; SSE-NEXT:    xorq %r14, %r9 -; SSE-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT:    xorq %rdx, %r11 -; SSE-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT:    orq %r13, %rax -; SSE-NEXT:    movq %r8, 48(%rdi) -; SSE-NEXT:    movq %rbp, 56(%rdi) -; SSE-NEXT:    movq %r9, 32(%rdi) -; SSE-NEXT:    movq %r10, 40(%rdi) -; SSE-NEXT:    movq %rsi, 16(%rdi) -; SSE-NEXT:    movq %r15, 24(%rdi) -; SSE-NEXT:    movq %rbx, (%rdi) -; SSE-NEXT:    movq %r11, 8(%rdi) -; SSE-NEXT:    setne %al -; SSE-NEXT:    addq $56, %rsp -; SSE-NEXT:    popq %rbx -; SSE-NEXT:    popq %r12 -; SSE-NEXT:    popq %r13 -; SSE-NEXT:    popq %r14 -; SSE-NEXT:    popq %r15 -; SSE-NEXT:    popq %rbp -; SSE-NEXT:    retq -; -; AVX2-LABEL: complement_ne_i512: -; AVX2:       # %bb.0: -; AVX2-NEXT:    pushq %rbp -; AVX2-NEXT:    pushq %r15 -; AVX2-NEXT:    pushq %r14 -; AVX2-NEXT:    pushq %r13 -; AVX2-NEXT:    pushq %r12 -; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    subq $72, %rsp -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT:    vmovups %ymm0, (%rsp) -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    andl $63, %ecx -; AVX2-NEXT:    shrl $3, %esi -; AVX2-NEXT:    andl $56, %esi -; AVX2-NEXT:    negl %esi -; AVX2-NEXT:    movslq %esi, %rbx -; AVX2-NEXT:    movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT:    movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT:    movq %rbp, %rax -; AVX2-NEXT:    shldq %cl, %rsi, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT:    movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT:    shldq %cl, %r8, %r13 -; AVX2-NEXT:    movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT:    movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT:    movq %r14, %r10 -; AVX2-NEXT:    shldq %cl, %r9, %r10 -; AVX2-NEXT:    movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT:    shldq %cl, %r11, %rsi -; AVX2-NEXT:    shldq %cl, %r14, %r8 -; AVX2-NEXT:    movq 16(%rdi), %r12 -; AVX2-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 48(%rdi), %r14 -; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r8, %r14 -; AVX2-NEXT:    andq %rsi, %r12 -; AVX2-NEXT:    orq %r14, %r12 -; AVX2-NEXT:    movq 56(%rdi), %r15 -; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r13, %r15 -; AVX2-NEXT:    movq 24(%rdi), %r14 -; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %rax, %r14 -; AVX2-NEXT:    orq %r15, %r14 -; AVX2-NEXT:    shldq %cl, %rbp, %r9 -; AVX2-NEXT:    movq (%rsp,%rbx), %rdx -; AVX2-NEXT:    movq 32(%rdi), %r15 -; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r9, %r15 -; AVX2-NEXT:    shlxq %rcx, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq (%rdi), %rbx -; AVX2-NEXT:    movq %rbx, %rbp -; AVX2-NEXT:    andq %rax, %rbp -; AVX2-NEXT:    orq %r15, %rbp -; AVX2-NEXT:    orq %r12, %rbp -; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT:    shldq %cl, %rdx, %r11 -; AVX2-NEXT:    movq 40(%rdi), %rax -; AVX2-NEXT:    movq %rax, %rcx -; AVX2-NEXT:    andq %r10, %rcx -; AVX2-NEXT:    movq 8(%rdi), %r15 -; AVX2-NEXT:    movq %r15, %r12 -; AVX2-NEXT:    andq %r11, %r12 -; AVX2-NEXT:    orq %rcx, %r12 -; AVX2-NEXT:    orq %r14, %r12 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT:    xorq %rax, %r10 -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT:    xorq %r15, %r11 -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT:    orq %rbp, %r12 -; AVX2-NEXT:    movq %r8, 48(%rdi) -; AVX2-NEXT:    movq %r13, 56(%rdi) -; AVX2-NEXT:    movq %r9, 32(%rdi) -; AVX2-NEXT:    movq %r10, 40(%rdi) -; AVX2-NEXT:    movq %rsi, 16(%rdi) -; AVX2-NEXT:    movq %rcx, 24(%rdi) -; AVX2-NEXT:    movq %rbx, (%rdi) -; AVX2-NEXT:    movq %r11, 8(%rdi) -; AVX2-NEXT:    setne %al -; AVX2-NEXT:    addq $72, %rsp -; AVX2-NEXT:    popq %rbx -; AVX2-NEXT:    popq %r12 -; AVX2-NEXT:    popq %r13 -; AVX2-NEXT:    popq %r14 -; AVX2-NEXT:    popq %r15 -; AVX2-NEXT:    popq %rbp -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq -; -; AVX512-LABEL: complement_ne_i512: -; AVX512:       # %bb.0: -; AVX512-NEXT:    pushq %rbp -; AVX512-NEXT:    pushq %r15 -; AVX512-NEXT:    pushq %r14 -; AVX512-NEXT:    pushq %r13 -; AVX512-NEXT:    pushq %r12 -; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    subq $72, %rsp -; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT:    vmovups %ymm0, (%rsp) -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %esi -; AVX512-NEXT:    andl $56, %esi -; AVX512-NEXT:    negl %esi -; AVX512-NEXT:    movslq %esi, %rbx -; AVX512-NEXT:    movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT:    movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT:    movq %rbp, %rax -; AVX512-NEXT:    shldq %cl, %rsi, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT:    movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT:    shldq %cl, %r8, %r13 -; AVX512-NEXT:    movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT:    movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT:    movq %r14, %r10 -; AVX512-NEXT:    shldq %cl, %r9, %r10 -; AVX512-NEXT:    movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT:    shldq %cl, %r11, %rsi -; AVX512-NEXT:    shldq %cl, %r14, %r8 -; AVX512-NEXT:    movq 16(%rdi), %r12 -; AVX512-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 48(%rdi), %r14 -; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r8, %r14 -; AVX512-NEXT:    andq %rsi, %r12 -; AVX512-NEXT:    orq %r14, %r12 -; AVX512-NEXT:    movq 56(%rdi), %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r13, %r15 -; AVX512-NEXT:    movq 24(%rdi), %r14 -; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %rax, %r14 -; AVX512-NEXT:    orq %r15, %r14 -; AVX512-NEXT:    shldq %cl, %rbp, %r9 -; AVX512-NEXT:    movq (%rsp,%rbx), %rdx -; AVX512-NEXT:    movq 32(%rdi), %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r9, %r15 -; AVX512-NEXT:    shlxq %rcx, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq (%rdi), %rbx -; AVX512-NEXT:    movq %rbx, %rbp -; AVX512-NEXT:    andq %rax, %rbp -; AVX512-NEXT:    orq %r15, %rbp -; AVX512-NEXT:    orq %r12, %rbp -; AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT:    shldq %cl, %rdx, %r11 -; AVX512-NEXT:    movq 40(%rdi), %rax -; AVX512-NEXT:    movq %rax, %rcx -; AVX512-NEXT:    andq %r10, %rcx -; AVX512-NEXT:    movq 8(%rdi), %r15 -; AVX512-NEXT:    movq %r15, %r12 -; AVX512-NEXT:    andq %r11, %r12 -; AVX512-NEXT:    orq %rcx, %r12 -; AVX512-NEXT:    orq %r14, %r12 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT:    xorq %rax, %r10 -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT:    xorq %r15, %r11 -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT:    orq %rbp, %r12 -; AVX512-NEXT:    movq %r8, 48(%rdi) -; AVX512-NEXT:    movq %r13, 56(%rdi) -; AVX512-NEXT:    movq %r9, 32(%rdi) -; AVX512-NEXT:    movq %r10, 40(%rdi) -; AVX512-NEXT:    movq %rsi, 16(%rdi) -; AVX512-NEXT:    movq %rcx, 24(%rdi) -; AVX512-NEXT:    movq %rbx, (%rdi) -; AVX512-NEXT:    movq %r11, 8(%rdi) -; AVX512-NEXT:    setne %al -; AVX512-NEXT:    addq $72, %rsp -; AVX512-NEXT:    popq %rbx -; AVX512-NEXT:    popq %r12 -; AVX512-NEXT:    popq %r13 -; AVX512-NEXT:    popq %r14 -; AVX512-NEXT:    popq %r15 -; AVX512-NEXT:    popq %rbp -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; X64-LABEL: complement_ne_i512: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    andl $60, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setb %al +; X64-NEXT:    btcl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 511    %ofs = zext nneg i32 %rem to i512    %bit = shl nuw i512 1, %ofs @@ -2182,606 +887,33 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {  define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {  ; X86-LABEL: reset_eq_i512:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $288, %esp # imm = 0x120 -; X86-NEXT:    movl 12(%ebp), %ecx -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrl $3, %eax -; X86-NEXT:    andl $60, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    leal {{[0-9]+}}(%esp), %edi -; X86-NEXT:    subl %eax, %edi -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 4(%edi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%edi), %eax -; X86-NEXT:    andl $31, %ecx -; X86-NEXT:    movl %eax, %ebx -; X86-NEXT:    shldl %cl, %edx, %ebx -; X86-NEXT:    movl 12(%edi), %edx -; X86-NEXT:    movl %edx, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%edi), %eax -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%edi), %edx -; X86-NEXT:    movl %edx, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 24(%edi), %eax -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%edi), %edx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx  ; X86-NEXT:    movl %edx, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%edi), %eax -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%edi), %esi -; X86-NEXT:    movl %esi, %edx -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%edi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %esi, %edx -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %edx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %ebx -; X86-NEXT:    orl %edx, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%edi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl 52(%edi), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%edi), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shldl %cl, %esi, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    movl 56(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %ebx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 24(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %eax -; X86-NEXT:    orl %ebx, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %esi, %ebx -; X86-NEXT:    movl 44(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%edi), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%edi), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    negl %eax -; X86-NEXT:    movl 256(%esp,%eax), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl %esi, %edi -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %ebx, %esi -; X86-NEXT:    movl 32(%ebx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %edx -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%ebx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %ebx -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %eax -; X86-NEXT:    orl %ebx, %eax -; X86-NEXT:    orl %edi, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%esi), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %edx -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 4(%esi), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %edi, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%esi), %edi -; X86-NEXT:    andl %edi, %ecx -; X86-NEXT:    movl %ecx, %esi -; X86-NEXT:    movl %edx, %ecx -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    movl 52(%ebx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    notl %ebx -; X86-NEXT:    andl %edi, %ebx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    notl %esi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    notl %edi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    notl %edi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    notl %edi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    notl %ecx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    movl %edx, 60(%eax) -; X86-NEXT:    movl %esi, 56(%eax) -; X86-NEXT:    movl %ecx, 52(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 44(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 40(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 36(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 32(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 28(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 24(%eax) -; X86-NEXT:    movl %ebx, 20(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 16(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 12(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 8(%eax) -; X86-NEXT:    movl %edi, 4(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, (%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 48(%eax) -; X86-NEXT:    sete %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    andl $60, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setae %al +; X86-NEXT:    btrl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: reset_eq_i512: -; SSE:       # %bb.0: -; SSE-NEXT:    pushq %rbp -; SSE-NEXT:    pushq %r15 -; SSE-NEXT:    pushq %r14 -; SSE-NEXT:    pushq %r13 -; SSE-NEXT:    pushq %r12 -; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    subq $56, %rsp -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    andl $63, %ecx -; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    andl $56, %esi -; SSE-NEXT:    negl %esi -; SSE-NEXT:    movslq %esi, %rdx -; SSE-NEXT:    movq (%rsp,%rdx), %r9 -; SSE-NEXT:    movq 8(%rsp,%rdx), %r8 -; SSE-NEXT:    movq %r8, %rsi -; SSE-NEXT:    shldq %cl, %r9, %rsi -; SSE-NEXT:    movq -8(%rsp,%rdx), %rax -; SSE-NEXT:    shldq %cl, %rax, %r9 -; SSE-NEXT:    movq 16(%rsp,%rdx), %r14 -; SSE-NEXT:    movq 24(%rsp,%rdx), %r10 -; SSE-NEXT:    movq %r10, %rbx -; SSE-NEXT:    shldq %cl, %r14, %rbx -; SSE-NEXT:    shldq %cl, %r8, %r14 -; SSE-NEXT:    movq 32(%rsp,%rdx), %r13 -; SSE-NEXT:    movq 40(%rsp,%rdx), %r12 -; SSE-NEXT:    shldq %cl, %r13, %r12 -; SSE-NEXT:    shldq %cl, %r10, %r13 -; SSE-NEXT:    movq -16(%rsp,%rdx), %rdx -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx -; SSE-NEXT:    shlq %cl, %rdx -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq %r12, %rbp -; SSE-NEXT:    movq %r9, %r15 -; SSE-NEXT:    movq %rsi, %r11 -; SSE-NEXT:    movq 16(%rdi), %r8 -; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 48(%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rcx, %r13 -; SSE-NEXT:    andq %r8, %r9 -; SSE-NEXT:    orq %r13, %r9 -; SSE-NEXT:    movq 56(%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rcx, %r12 -; SSE-NEXT:    movq 24(%rdi), %r10 -; SSE-NEXT:    andq %r10, %rsi -; SSE-NEXT:    orq %r12, %rsi -; SSE-NEXT:    movq %r14, %r13 -; SSE-NEXT:    movq 32(%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rcx, %r14 -; SSE-NEXT:    movq %rdx, %r12 -; SSE-NEXT:    movq (%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rcx, %rdx -; SSE-NEXT:    orq %r14, %rdx -; SSE-NEXT:    orq %r9, %rdx -; SSE-NEXT:    movq %rbx, %r14 -; SSE-NEXT:    movq 40(%rdi), %rcx -; SSE-NEXT:    andq %rcx, %rbx -; SSE-NEXT:    movq %rax, %r9 -; SSE-NEXT:    movq 8(%rdi), %r8 -; SSE-NEXT:    andq %r8, %rax -; SSE-NEXT:    orq %rbx, %rax -; SSE-NEXT:    orq %rsi, %rax -; SSE-NEXT:    notq %r11 -; SSE-NEXT:    andq %r10, %r11 -; SSE-NEXT:    notq %r15 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT:    notq %r14 -; SSE-NEXT:    andq %rcx, %r14 -; SSE-NEXT:    notq %r13 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT:    notq %rbp -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT:    notq %rcx -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; SSE-NEXT:    notq %r9 -; SSE-NEXT:    andq %r8, %r9 -; SSE-NEXT:    notq %r12 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    movq %rcx, 48(%rdi) -; SSE-NEXT:    movq %rbp, 56(%rdi) -; SSE-NEXT:    movq %r13, 32(%rdi) -; SSE-NEXT:    movq %r14, 40(%rdi) -; SSE-NEXT:    movq %r15, 16(%rdi) -; SSE-NEXT:    movq %r11, 24(%rdi) -; SSE-NEXT:    movq %r12, (%rdi) -; SSE-NEXT:    movq %r9, 8(%rdi) -; SSE-NEXT:    sete %al -; SSE-NEXT:    addq $56, %rsp -; SSE-NEXT:    popq %rbx -; SSE-NEXT:    popq %r12 -; SSE-NEXT:    popq %r13 -; SSE-NEXT:    popq %r14 -; SSE-NEXT:    popq %r15 -; SSE-NEXT:    popq %rbp -; SSE-NEXT:    retq -; -; AVX2-LABEL: reset_eq_i512: -; AVX2:       # %bb.0: -; AVX2-NEXT:    pushq %rbp -; AVX2-NEXT:    pushq %r15 -; AVX2-NEXT:    pushq %r14 -; AVX2-NEXT:    pushq %r13 -; AVX2-NEXT:    pushq %r12 -; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    pushq %rax -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    andl $63, %ecx -; AVX2-NEXT:    shrl $3, %esi -; AVX2-NEXT:    andl $56, %esi -; AVX2-NEXT:    negl %esi -; AVX2-NEXT:    movslq %esi, %rdx -; AVX2-NEXT:    movq -48(%rsp,%rdx), %r8 -; AVX2-NEXT:    movq -40(%rsp,%rdx), %rbx -; AVX2-NEXT:    movq %rbx, %rax -; AVX2-NEXT:    shldq %cl, %r8, %rax -; AVX2-NEXT:    movq -16(%rsp,%rdx), %r10 -; AVX2-NEXT:    movq -8(%rsp,%rdx), %rsi -; AVX2-NEXT:    shldq %cl, %r10, %rsi -; AVX2-NEXT:    movq -32(%rsp,%rdx), %r11 -; AVX2-NEXT:    movq -24(%rsp,%rdx), %r14 -; AVX2-NEXT:    movq %r14, %r9 -; AVX2-NEXT:    shldq %cl, %r11, %r9 -; AVX2-NEXT:    movq -64(%rsp,%rdx), %r15 -; AVX2-NEXT:    movq -56(%rsp,%rdx), %rdx -; AVX2-NEXT:    shldq %cl, %rdx, %r8 -; AVX2-NEXT:    shldq %cl, %r14, %r10 -; AVX2-NEXT:    shldq %cl, %rbx, %r11 -; AVX2-NEXT:    shldq %cl, %r15, %rdx -; AVX2-NEXT:    shlxq %rcx, %r15, %rcx -; AVX2-NEXT:    movq 24(%rdi), %rbx -; AVX2-NEXT:    movq 56(%rdi), %r14 -; AVX2-NEXT:    movq 16(%rdi), %r15 -; AVX2-NEXT:    movq 48(%rdi), %r13 -; AVX2-NEXT:    movq 32(%rdi), %rbp -; AVX2-NEXT:    andnq %rbp, %r11, %r12 -; AVX2-NEXT:    andq %r11, %rbp -; AVX2-NEXT:    andnq %r13, %r10, %r11 -; AVX2-NEXT:    andq %r10, %r13 -; AVX2-NEXT:    andnq %r15, %r8, %r10 -; AVX2-NEXT:    andq %r8, %r15 -; AVX2-NEXT:    movq 40(%rdi), %r8 -; AVX2-NEXT:    orq %r13, %r15 -; AVX2-NEXT:    andnq %r8, %r9, %r13 -; AVX2-NEXT:    andq %r9, %r8 -; AVX2-NEXT:    andnq %r14, %rsi, %r9 -; AVX2-NEXT:    andq %rsi, %r14 -; AVX2-NEXT:    andnq %rbx, %rax, %rsi -; AVX2-NEXT:    andq %rax, %rbx -; AVX2-NEXT:    movq (%rdi), %rax -; AVX2-NEXT:    orq %r14, %rbx -; AVX2-NEXT:    andnq %rax, %rcx, %r14 -; AVX2-NEXT:    andq %rcx, %rax -; AVX2-NEXT:    orq %rbp, %rax -; AVX2-NEXT:    movq 8(%rdi), %rcx -; AVX2-NEXT:    orq %r15, %rax -; AVX2-NEXT:    andnq %rcx, %rdx, %r15 -; AVX2-NEXT:    andq %rdx, %rcx -; AVX2-NEXT:    orq %r8, %rcx -; AVX2-NEXT:    orq %rbx, %rcx -; AVX2-NEXT:    orq %rax, %rcx -; AVX2-NEXT:    movq %r11, 48(%rdi) -; AVX2-NEXT:    movq %r9, 56(%rdi) -; AVX2-NEXT:    movq %r12, 32(%rdi) -; AVX2-NEXT:    movq %r13, 40(%rdi) -; AVX2-NEXT:    movq %r10, 16(%rdi) -; AVX2-NEXT:    movq %rsi, 24(%rdi) -; AVX2-NEXT:    movq %r14, (%rdi) -; AVX2-NEXT:    movq %r15, 8(%rdi) -; AVX2-NEXT:    sete %al -; AVX2-NEXT:    addq $8, %rsp -; AVX2-NEXT:    popq %rbx -; AVX2-NEXT:    popq %r12 -; AVX2-NEXT:    popq %r13 -; AVX2-NEXT:    popq %r14 -; AVX2-NEXT:    popq %r15 -; AVX2-NEXT:    popq %rbp -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq -; -; AVX512-LABEL: reset_eq_i512: -; AVX512:       # %bb.0: -; AVX512-NEXT:    pushq %rbp -; AVX512-NEXT:    pushq %r15 -; AVX512-NEXT:    pushq %r14 -; AVX512-NEXT:    pushq %r13 -; AVX512-NEXT:    pushq %r12 -; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    pushq %rax -; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %esi -; AVX512-NEXT:    andl $56, %esi -; AVX512-NEXT:    negl %esi -; AVX512-NEXT:    movslq %esi, %rbx -; AVX512-NEXT:    movq -48(%rsp,%rbx), %r8 -; AVX512-NEXT:    movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT:    movq %r14, %rax -; AVX512-NEXT:    shldq %cl, %r8, %rax -; AVX512-NEXT:    movq -16(%rsp,%rbx), %r10 -; AVX512-NEXT:    movq -8(%rsp,%rbx), %rsi -; AVX512-NEXT:    shldq %cl, %r10, %rsi -; AVX512-NEXT:    movq -32(%rsp,%rbx), %r11 -; AVX512-NEXT:    movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT:    movq %r15, %r9 -; AVX512-NEXT:    shldq %cl, %r11, %r9 -; AVX512-NEXT:    movq -56(%rsp,%rbx), %rdx -; AVX512-NEXT:    shldq %cl, %rdx, %r8 -; AVX512-NEXT:    shldq %cl, %r15, %r10 -; AVX512-NEXT:    shldq %cl, %r14, %r11 -; AVX512-NEXT:    movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT:    shldq %cl, %rbx, %rdx -; AVX512-NEXT:    shlxq %rcx, %rbx, %rcx -; AVX512-NEXT:    movq 24(%rdi), %rbx -; AVX512-NEXT:    movq 56(%rdi), %r14 -; AVX512-NEXT:    movq 16(%rdi), %r15 -; AVX512-NEXT:    movq 48(%rdi), %r13 -; AVX512-NEXT:    movq 32(%rdi), %rbp -; AVX512-NEXT:    andnq %rbp, %r11, %r12 -; AVX512-NEXT:    andq %r11, %rbp -; AVX512-NEXT:    andnq %r13, %r10, %r11 -; AVX512-NEXT:    andq %r10, %r13 -; AVX512-NEXT:    andnq %r15, %r8, %r10 -; AVX512-NEXT:    andq %r8, %r15 -; AVX512-NEXT:    movq 40(%rdi), %r8 -; AVX512-NEXT:    orq %r13, %r15 -; AVX512-NEXT:    andnq %r8, %r9, %r13 -; AVX512-NEXT:    andq %r9, %r8 -; AVX512-NEXT:    andnq %r14, %rsi, %r9 -; AVX512-NEXT:    andq %rsi, %r14 -; AVX512-NEXT:    andnq %rbx, %rax, %rsi -; AVX512-NEXT:    andq %rax, %rbx -; AVX512-NEXT:    movq (%rdi), %rax -; AVX512-NEXT:    orq %r14, %rbx -; AVX512-NEXT:    andnq %rax, %rcx, %r14 -; AVX512-NEXT:    andq %rcx, %rax -; AVX512-NEXT:    orq %rbp, %rax -; AVX512-NEXT:    movq 8(%rdi), %rcx -; AVX512-NEXT:    orq %r15, %rax -; AVX512-NEXT:    andnq %rcx, %rdx, %r15 -; AVX512-NEXT:    andq %rdx, %rcx -; AVX512-NEXT:    orq %r8, %rcx -; AVX512-NEXT:    orq %rbx, %rcx -; AVX512-NEXT:    orq %rax, %rcx -; AVX512-NEXT:    movq %r11, 48(%rdi) -; AVX512-NEXT:    movq %r9, 56(%rdi) -; AVX512-NEXT:    movq %r12, 32(%rdi) -; AVX512-NEXT:    movq %r13, 40(%rdi) -; AVX512-NEXT:    movq %r10, 16(%rdi) -; AVX512-NEXT:    movq %rsi, 24(%rdi) -; AVX512-NEXT:    movq %r14, (%rdi) -; AVX512-NEXT:    movq %r15, 8(%rdi) -; AVX512-NEXT:    sete %al -; AVX512-NEXT:    addq $8, %rsp -; AVX512-NEXT:    popq %rbx -; AVX512-NEXT:    popq %r12 -; AVX512-NEXT:    popq %r13 -; AVX512-NEXT:    popq %r14 -; AVX512-NEXT:    popq %r15 -; AVX512-NEXT:    popq %rbp -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; X64-LABEL: reset_eq_i512: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    andl $60, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setae %al +; X64-NEXT:    btrl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 511    %ofs = zext nneg i32 %rem to i512    %bit = shl nuw i512 1, %ofs @@ -2797,572 +929,33 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {  define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {  ; X86-LABEL: set_ne_i512:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $272, %esp # imm = 0x110 -; X86-NEXT:    movl 12(%ebp), %ecx -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrl $3, %eax -; X86-NEXT:    andl $60, %eax -; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill -; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx -; X86-NEXT:    subl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 24(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl $31, %ecx -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%edx), %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%edx), %ebx -; X86-NEXT:    movl %ebx, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%edx), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 52(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 4(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %edx -; X86-NEXT:    movl 40(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %eax -; X86-NEXT:    movl 8(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    movl %edx, %eax -; X86-NEXT:    movl 56(%edx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edi, %ebx -; X86-NEXT:    movl 24(%edx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%eax), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl 12(%eax), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    orl %esi, %ebx -; X86-NEXT:    movl 60(%eax), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    movl %edx, %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 28(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl (%eax), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%esp), %eax # 4-byte Reload -; X86-NEXT:    negl %eax -; X86-NEXT:    movl 240(%esp,%eax), %esi -; X86-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    movl 32(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edi, %eax -; X86-NEXT:    movl (%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl 16(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %eax -; X86-NEXT:    movl 48(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 36(%esi), %ebx -; X86-NEXT:    movl %ebx, %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 4(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl %esi, %eax -; X86-NEXT:    movl 20(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl %esi, %edi -; X86-NEXT:    movl 52(%eax), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    orl %edi, %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl 8(%ebp), %edx -; X86-NEXT:    movl %ebx, 60(%edx) -; X86-NEXT:    movl %edi, 56(%edx) -; X86-NEXT:    movl %ecx, 52(%edx) -; X86-NEXT:    movl %esi, 44(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 40(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 36(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 32(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 28(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 24(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 20(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 16(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 12(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 8(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 4(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, (%edx) -; X86-NEXT:    movl (%esp), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 48(%edx) -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    andl $60, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btsl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: set_ne_i512: -; SSE:       # %bb.0: -; SSE-NEXT:    pushq %rbp -; SSE-NEXT:    pushq %r15 -; SSE-NEXT:    pushq %r14 -; SSE-NEXT:    pushq %r13 -; SSE-NEXT:    pushq %r12 -; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    subq $56, %rsp -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    andl $63, %ecx -; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    andl $56, %esi -; SSE-NEXT:    negl %esi -; SSE-NEXT:    movslq %esi, %rbx -; SSE-NEXT:    movq (%rsp,%rbx), %rsi -; SSE-NEXT:    movq 8(%rsp,%rbx), %r14 -; SSE-NEXT:    movq %r14, %rax -; SSE-NEXT:    shldq %cl, %rsi, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 32(%rsp,%rbx), %r8 -; SSE-NEXT:    movq 40(%rsp,%rbx), %rbp -; SSE-NEXT:    shldq %cl, %r8, %rbp -; SSE-NEXT:    movq 16(%rsp,%rbx), %r9 -; SSE-NEXT:    movq 24(%rsp,%rbx), %r15 -; SSE-NEXT:    movq %r15, %r10 -; SSE-NEXT:    shldq %cl, %r9, %r10 -; SSE-NEXT:    movq -8(%rsp,%rbx), %r11 -; SSE-NEXT:    shldq %cl, %r11, %rsi -; SSE-NEXT:    shldq %cl, %r15, %r8 -; SSE-NEXT:    shldq %cl, %r14, %r9 -; SSE-NEXT:    movq -16(%rsp,%rbx), %rbx -; SSE-NEXT:    shldq %cl, %rbx, %r11 -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx -; SSE-NEXT:    shlq %cl, %rbx -; SSE-NEXT:    movq 24(%rdi), %r15 -; SSE-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 56(%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 16(%rdi), %r12 -; SSE-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 48(%rdi), %r13 -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %r8, %r13 -; SSE-NEXT:    andq %rsi, %r12 -; SSE-NEXT:    orq %r13, %r12 -; SSE-NEXT:    movq %rcx, %r13 -; SSE-NEXT:    andq %rbp, %r13 -; SSE-NEXT:    andq %rax, %r15 -; SSE-NEXT:    orq %r13, %r15 -; SSE-NEXT:    movq 32(%rdi), %r14 -; SSE-NEXT:    movq %r14, %rcx -; SSE-NEXT:    andq %r9, %rcx -; SSE-NEXT:    movq (%rdi), %r13 -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rbx, %r13 -; SSE-NEXT:    orq %rcx, %r13 -; SSE-NEXT:    orq %r12, %r13 -; SSE-NEXT:    movq 40(%rdi), %rcx -; SSE-NEXT:    movq %rcx, %r12 -; SSE-NEXT:    andq %r10, %r12 -; SSE-NEXT:    movq 8(%rdi), %rdx -; SSE-NEXT:    movq %rdx, %rax -; SSE-NEXT:    andq %r11, %rax -; SSE-NEXT:    orq %r12, %rax -; SSE-NEXT:    orq %r15, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT:    orq %rcx, %r10 -; SSE-NEXT:    orq %r14, %r9 -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT:    orq %rdx, %r11 -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT:    orq %r13, %rax -; SSE-NEXT:    movq %r8, 48(%rdi) -; SSE-NEXT:    movq %rbp, 56(%rdi) -; SSE-NEXT:    movq %r9, 32(%rdi) -; SSE-NEXT:    movq %r10, 40(%rdi) -; SSE-NEXT:    movq %rsi, 16(%rdi) -; SSE-NEXT:    movq %r15, 24(%rdi) -; SSE-NEXT:    movq %rbx, (%rdi) -; SSE-NEXT:    movq %r11, 8(%rdi) -; SSE-NEXT:    setne %al -; SSE-NEXT:    addq $56, %rsp -; SSE-NEXT:    popq %rbx -; SSE-NEXT:    popq %r12 -; SSE-NEXT:    popq %r13 -; SSE-NEXT:    popq %r14 -; SSE-NEXT:    popq %r15 -; SSE-NEXT:    popq %rbp -; SSE-NEXT:    retq -; -; AVX2-LABEL: set_ne_i512: -; AVX2:       # %bb.0: -; AVX2-NEXT:    pushq %rbp -; AVX2-NEXT:    pushq %r15 -; AVX2-NEXT:    pushq %r14 -; AVX2-NEXT:    pushq %r13 -; AVX2-NEXT:    pushq %r12 -; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    subq $72, %rsp -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT:    vmovups %ymm0, (%rsp) -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    andl $63, %ecx -; AVX2-NEXT:    shrl $3, %esi -; AVX2-NEXT:    andl $56, %esi -; AVX2-NEXT:    negl %esi -; AVX2-NEXT:    movslq %esi, %rbx -; AVX2-NEXT:    movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT:    movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT:    movq %rbp, %rax -; AVX2-NEXT:    shldq %cl, %rsi, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT:    movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT:    shldq %cl, %r8, %r13 -; AVX2-NEXT:    movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT:    movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT:    movq %r14, %r10 -; AVX2-NEXT:    shldq %cl, %r9, %r10 -; AVX2-NEXT:    movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT:    shldq %cl, %r11, %rsi -; AVX2-NEXT:    shldq %cl, %r14, %r8 -; AVX2-NEXT:    movq 16(%rdi), %r12 -; AVX2-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 48(%rdi), %r14 -; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r8, %r14 -; AVX2-NEXT:    andq %rsi, %r12 -; AVX2-NEXT:    orq %r14, %r12 -; AVX2-NEXT:    movq 56(%rdi), %r15 -; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r13, %r15 -; AVX2-NEXT:    movq 24(%rdi), %r14 -; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %rax, %r14 -; AVX2-NEXT:    orq %r15, %r14 -; AVX2-NEXT:    shldq %cl, %rbp, %r9 -; AVX2-NEXT:    movq (%rsp,%rbx), %rdx -; AVX2-NEXT:    movq 32(%rdi), %r15 -; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r9, %r15 -; AVX2-NEXT:    shlxq %rcx, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq (%rdi), %rbx -; AVX2-NEXT:    movq %rbx, %rbp -; AVX2-NEXT:    andq %rax, %rbp -; AVX2-NEXT:    orq %r15, %rbp -; AVX2-NEXT:    orq %r12, %rbp -; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT:    shldq %cl, %rdx, %r11 -; AVX2-NEXT:    movq 40(%rdi), %rax -; AVX2-NEXT:    movq %rax, %rcx -; AVX2-NEXT:    andq %r10, %rcx -; AVX2-NEXT:    movq 8(%rdi), %r15 -; AVX2-NEXT:    movq %r15, %r12 -; AVX2-NEXT:    andq %r11, %r12 -; AVX2-NEXT:    orq %rcx, %r12 -; AVX2-NEXT:    orq %r14, %r12 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT:    orq %rax, %r10 -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT:    orq %r15, %r11 -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT:    orq %rbp, %r12 -; AVX2-NEXT:    movq %r8, 48(%rdi) -; AVX2-NEXT:    movq %r13, 56(%rdi) -; AVX2-NEXT:    movq %r9, 32(%rdi) -; AVX2-NEXT:    movq %r10, 40(%rdi) -; AVX2-NEXT:    movq %rsi, 16(%rdi) -; AVX2-NEXT:    movq %rcx, 24(%rdi) -; AVX2-NEXT:    movq %rbx, (%rdi) -; AVX2-NEXT:    movq %r11, 8(%rdi) -; AVX2-NEXT:    setne %al -; AVX2-NEXT:    addq $72, %rsp -; AVX2-NEXT:    popq %rbx -; AVX2-NEXT:    popq %r12 -; AVX2-NEXT:    popq %r13 -; AVX2-NEXT:    popq %r14 -; AVX2-NEXT:    popq %r15 -; AVX2-NEXT:    popq %rbp -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq -; -; AVX512-LABEL: set_ne_i512: -; AVX512:       # %bb.0: -; AVX512-NEXT:    pushq %rbp -; AVX512-NEXT:    pushq %r15 -; AVX512-NEXT:    pushq %r14 -; AVX512-NEXT:    pushq %r13 -; AVX512-NEXT:    pushq %r12 -; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    subq $72, %rsp -; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT:    vmovups %ymm0, (%rsp) -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %esi -; AVX512-NEXT:    andl $56, %esi -; AVX512-NEXT:    negl %esi -; AVX512-NEXT:    movslq %esi, %rbx -; AVX512-NEXT:    movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT:    movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT:    movq %rbp, %rax -; AVX512-NEXT:    shldq %cl, %rsi, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT:    movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT:    shldq %cl, %r8, %r13 -; AVX512-NEXT:    movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT:    movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT:    movq %r14, %r10 -; AVX512-NEXT:    shldq %cl, %r9, %r10 -; AVX512-NEXT:    movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT:    shldq %cl, %r11, %rsi -; AVX512-NEXT:    shldq %cl, %r14, %r8 -; AVX512-NEXT:    movq 16(%rdi), %r12 -; AVX512-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 48(%rdi), %r14 -; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r8, %r14 -; AVX512-NEXT:    andq %rsi, %r12 -; AVX512-NEXT:    orq %r14, %r12 -; AVX512-NEXT:    movq 56(%rdi), %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r13, %r15 -; AVX512-NEXT:    movq 24(%rdi), %r14 -; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %rax, %r14 -; AVX512-NEXT:    orq %r15, %r14 -; AVX512-NEXT:    shldq %cl, %rbp, %r9 -; AVX512-NEXT:    movq (%rsp,%rbx), %rdx -; AVX512-NEXT:    movq 32(%rdi), %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r9, %r15 -; AVX512-NEXT:    shlxq %rcx, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq (%rdi), %rbx -; AVX512-NEXT:    movq %rbx, %rbp -; AVX512-NEXT:    andq %rax, %rbp -; AVX512-NEXT:    orq %r15, %rbp -; AVX512-NEXT:    orq %r12, %rbp -; AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT:    shldq %cl, %rdx, %r11 -; AVX512-NEXT:    movq 40(%rdi), %rax -; AVX512-NEXT:    movq %rax, %rcx -; AVX512-NEXT:    andq %r10, %rcx -; AVX512-NEXT:    movq 8(%rdi), %r15 -; AVX512-NEXT:    movq %r15, %r12 -; AVX512-NEXT:    andq %r11, %r12 -; AVX512-NEXT:    orq %rcx, %r12 -; AVX512-NEXT:    orq %r14, %r12 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT:    orq %rax, %r10 -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT:    orq %r15, %r11 -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT:    orq %rbp, %r12 -; AVX512-NEXT:    movq %r8, 48(%rdi) -; AVX512-NEXT:    movq %r13, 56(%rdi) -; AVX512-NEXT:    movq %r9, 32(%rdi) -; AVX512-NEXT:    movq %r10, 40(%rdi) -; AVX512-NEXT:    movq %rsi, 16(%rdi) -; AVX512-NEXT:    movq %rcx, 24(%rdi) -; AVX512-NEXT:    movq %rbx, (%rdi) -; AVX512-NEXT:    movq %r11, 8(%rdi) -; AVX512-NEXT:    setne %al -; AVX512-NEXT:    addq $72, %rsp -; AVX512-NEXT:    popq %rbx -; AVX512-NEXT:    popq %r12 -; AVX512-NEXT:    popq %r13 -; AVX512-NEXT:    popq %r14 -; AVX512-NEXT:    popq %r15 -; AVX512-NEXT:    popq %rbp -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; X64-LABEL: set_ne_i512: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    andl $60, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setb %al +; X64-NEXT:    btsl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 511    %ofs = zext nneg i32 %rem to i512    %bit = shl nuw i512 1, %ofs @@ -3383,13 +976,14 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi  ; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $432, %esp # imm = 0x1B0 +; X86-NEXT:    subl $352, %esp # imm = 0x160  ; X86-NEXT:    movl 12(%ebp), %ecx  ; X86-NEXT:    movl %ecx, %edx  ; X86-NEXT:    shrl $3, %edx  ; X86-NEXT:    andl $60, %edx -; X86-NEXT:    leal {{[0-9]+}}(%esp), %esi -; X86-NEXT:    subl %edx, %esi +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax +; X86-NEXT:    subl %edx, %eax  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) @@ -3422,60 +1016,58 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 56(%esi), %eax +; X86-NEXT:    movl 56(%eax), %esi +; X86-NEXT:    movl 60(%eax), %ebx +; X86-NEXT:    movl 52(%eax), %edi +; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 48(%eax), %edi +; X86-NEXT:    movl 44(%eax), %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 40(%eax), %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 36(%eax), %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 32(%eax), %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 28(%eax), %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 24(%eax), %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 20(%eax), %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 16(%eax), %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 12(%eax), %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 8(%eax), %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl (%eax), %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 4(%eax), %eax  ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%esi), %eax +; X86-NEXT:    movzbl 16(%ebp), %eax +; X86-NEXT:    movzbl %al, %eax +; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT:    andl $31, %ecx +; X86-NEXT:    shldl %cl, %esi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT:    shldl %cl, %eax, %esi +; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    shldl %cl, %edi, %eax  ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 52(%esi), %eax -; X86-NEXT:    movl 48(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 24(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%esi), %edi +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    shldl %cl, %ebx, %edi  ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 4(%esi), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movzbl 16(%ebp), %ebx -; X86-NEXT:    movzbl %bl, %esi -; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT:    leal {{[0-9]+}}(%esp), %esi -; X86-NEXT:    subl %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl $31, %ecx  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT:    shldl %cl, %edx, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload  ; X86-NEXT:    shldl %cl, %eax, %edx  ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    shldl %cl, %edx, %eax +; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload  ; X86-NEXT:    shldl %cl, %eax, %edx  ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -3500,9 +1092,12 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload  ; X86-NEXT:    shldl %cl, %eax, %esi  ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    movl %ebx, %edx -; X86-NEXT:    shldl %cl, %edi, %edx +; X86-NEXT:    shll %cl, %eax +; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax +; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) @@ -3534,273 +1129,148 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %edx +; X86-NEXT:    movl 56(%eax), %esi +; X86-NEXT:    movl 60(%eax), %edi +; X86-NEXT:    shldl %cl, %esi, %edi +; X86-NEXT:    movl 8(%ebp), %edx +; X86-NEXT:    andl 60(%edx), %ebx +; X86-NEXT:    orl %edi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 52(%eax), %edi +; X86-NEXT:    shldl %cl, %edi, %esi +; X86-NEXT:    andl 56(%edx), %ebx +; X86-NEXT:    orl %esi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 48(%eax), %esi +; X86-NEXT:    shldl %cl, %esi, %edi +; X86-NEXT:    andl 52(%edx), %ebx +; X86-NEXT:    orl %edi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 44(%eax), %edi +; X86-NEXT:    shldl %cl, %edi, %esi +; X86-NEXT:    andl 48(%edx), %ebx +; X86-NEXT:    orl %esi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 40(%eax), %esi +; X86-NEXT:    shldl %cl, %esi, %edi +; X86-NEXT:    andl 44(%edx), %ebx +; X86-NEXT:    orl %edi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 36(%eax), %edi +; X86-NEXT:    shldl %cl, %edi, %esi +; X86-NEXT:    andl 40(%edx), %ebx +; X86-NEXT:    orl %esi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 32(%eax), %esi +; X86-NEXT:    shldl %cl, %esi, %edi +; X86-NEXT:    andl 36(%edx), %ebx +; X86-NEXT:    orl %edi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 28(%eax), %edi +; X86-NEXT:    shldl %cl, %edi, %esi +; X86-NEXT:    andl 32(%edx), %ebx +; X86-NEXT:    orl %esi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 24(%eax), %esi +; X86-NEXT:    shldl %cl, %esi, %edi +; X86-NEXT:    andl 28(%edx), %ebx +; X86-NEXT:    orl %edi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 20(%eax), %edi +; X86-NEXT:    shldl %cl, %edi, %esi +; X86-NEXT:    andl 24(%edx), %ebx +; X86-NEXT:    orl %esi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 16(%eax), %esi +; X86-NEXT:    shldl %cl, %esi, %edi +; X86-NEXT:    andl 20(%edx), %ebx +; X86-NEXT:    orl %edi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 12(%eax), %edi +; X86-NEXT:    shldl %cl, %edi, %esi +; X86-NEXT:    andl 16(%edx), %ebx +; X86-NEXT:    orl %esi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 8(%eax), %esi +; X86-NEXT:    shldl %cl, %esi, %edi +; X86-NEXT:    andl 12(%edx), %ebx +; X86-NEXT:    orl %edi, %ebx +; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 4(%eax), %edi +; X86-NEXT:    shldl %cl, %edi, %esi +; X86-NEXT:    andl 8(%edx), %ebx +; X86-NEXT:    orl %esi, %ebx  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%ebx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%ebx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %edx -; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    notl %esi +; X86-NEXT:    movl (%eax), %eax +; X86-NEXT:    shldl %cl, %eax, %edi +; X86-NEXT:    andl 4(%edx), %esi +; X86-NEXT:    orl %edi, %esi +; X86-NEXT:    movl %esi, %edi +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT:    notl %esi +; X86-NEXT:    shll %cl, %eax +; X86-NEXT:    andl (%edx), %esi +; X86-NEXT:    orl %eax, %esi  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    orl %esi, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %edx -; X86-NEXT:    movl %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 24(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %edx -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl (%edx,%eax), %eax +; X86-NEXT:    btl %ecx, %eax  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 52(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    movl %eax, %edx +; X86-NEXT:    movl %eax, 60(%edx)  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl %eax, 56(%edx)  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%ebx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT:    movl %eax, 52(%edx)  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %ecx -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%ebx), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edi, %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl %eax, 48(%edx)  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%ebx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %eax -; X86-NEXT:    movl %eax, %ecx -; X86-NEXT:    movl %edx, %eax -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 4(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl 56(%edi), %ebx -; X86-NEXT:    movl 60(%edi), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 52(%edi), %eax -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 48(%edi), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl %eax, 44(%edx)  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    notl %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl %eax, %edx -; X86-NEXT:    movl 40(%edi), %ebx -; X86-NEXT:    movl 44(%edi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 36(%edi), %eax -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 32(%edi), %ebx -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 28(%edi), %eax -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 24(%edi), %ebx -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 20(%edi), %eax -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 16(%edi), %ebx -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 12(%edi), %eax -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    notl %esi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl 8(%edi), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl %eax, 40(%edx)  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    notl %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 4(%edi), %ebx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    shldl %cl, %ebx, %edx -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx +; X86-NEXT:    movl %eax, 36(%edx)  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl (%edi), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    notl %edi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    movl %edi, %ecx +; X86-NEXT:    movl %eax, 32(%edx) +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT:    movl %eax, 28(%edx)  ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 60(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 56(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 52(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 44(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 40(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 36(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 32(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 28(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 24(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 20(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 16(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 12(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 8(%eax) -; X86-NEXT:    movl %edx, 4(%eax) -; X86-NEXT:    movl %ecx, (%eax) -; X86-NEXT:    movl %esi, 48(%eax) -; X86-NEXT:    sete %al +; X86-NEXT:    movl %eax, 24(%edx) +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT:    movl %eax, 20(%edx) +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT:    movl %eax, 16(%edx) +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT:    movl %eax, 12(%edx) +; X86-NEXT:    movl %ebx, 8(%edx) +; X86-NEXT:    movl %edi, 4(%edx) +; X86-NEXT:    movl %esi, (%edx) +; X86-NEXT:    setae %al  ; X86-NEXT:    leal -12(%ebp), %esp  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi @@ -3816,7 +1286,8 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; SSE-NEXT:    pushq %r13  ; SSE-NEXT:    pushq %r12  ; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    subq $216, %rsp +; SSE-NEXT:    subq $184, %rsp +; SSE-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill  ; SSE-NEXT:    xorps %xmm0, %xmm0  ; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)  ; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) @@ -3829,139 +1300,103 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; SSE-NEXT:    movq $1, {{[0-9]+}}(%rsp)  ; SSE-NEXT:    movl %esi, %ecx  ; SSE-NEXT:    andl $63, %ecx -; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    andl $56, %esi -; SSE-NEXT:    negl %esi -; SSE-NEXT:    movslq %esi, %r10 -; SSE-NEXT:    movq 184(%rsp,%r10), %r11 -; SSE-NEXT:    movq 192(%rsp,%r10), %rsi -; SSE-NEXT:    movq %rsi, %r13 -; SSE-NEXT:    shldq %cl, %r11, %r13 -; SSE-NEXT:    movq 200(%rsp,%r10), %r15 -; SSE-NEXT:    shldq %cl, %rsi, %r15 -; SSE-NEXT:    movq 168(%rsp,%r10), %rbx -; SSE-NEXT:    movq 176(%rsp,%r10), %rsi -; SSE-NEXT:    movq %rsi, %r14 -; SSE-NEXT:    shldq %cl, %rbx, %r14 -; SSE-NEXT:    shldq %cl, %rsi, %r11 -; SSE-NEXT:    movq 152(%rsp,%r10), %rax -; SSE-NEXT:    movq 160(%rsp,%r10), %r8 -; SSE-NEXT:    movq %r8, %r12 -; SSE-NEXT:    shldq %cl, %rax, %r12 -; SSE-NEXT:    shldq %cl, %r8, %rbx -; SSE-NEXT:    movq 144(%rsp,%r10), %r9 -; SSE-NEXT:    movq %r9, %r8 -; SSE-NEXT:    shlq %cl, %r8 -; SSE-NEXT:    shldq %cl, %r9, %rax -; SSE-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movl %edx, %edx -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) +; SSE-NEXT:    movl %esi, %eax +; SSE-NEXT:    shrl $3, %eax +; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT:    # kill: def $eax killed $eax killed $rax +; SSE-NEXT:    andl $56, %eax +; SSE-NEXT:    negl %eax +; SSE-NEXT:    movslq %eax, %r12 +; SSE-NEXT:    movq 160(%rsp,%r12), %rax +; SSE-NEXT:    movq 168(%rsp,%r12), %r10 +; SSE-NEXT:    shldq %cl, %rax, %r10 +; SSE-NEXT:    movq 152(%rsp,%r12), %rsi +; SSE-NEXT:    shldq %cl, %rsi, %rax +; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT:    movq 144(%rsp,%r12), %r11 +; SSE-NEXT:    shldq %cl, %r11, %rsi +; SSE-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT:    movq 136(%rsp,%r12), %rbx +; SSE-NEXT:    shldq %cl, %rbx, %r11 +; SSE-NEXT:    movq 128(%rsp,%r12), %r14 +; SSE-NEXT:    shldq %cl, %r14, %rbx +; SSE-NEXT:    movq 120(%rsp,%r12), %r15 +; SSE-NEXT:    shldq %cl, %r15, %r14 +; SSE-NEXT:    movq 112(%rsp,%r12), %r13 +; SSE-NEXT:    shldq %cl, %r13, %r15 +; SSE-NEXT:    shlq %cl, %r13 +; SSE-NEXT:    movl %edx, %eax  ; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp)  ; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, (%rsp) +; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) +; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)  ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)  ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)  ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp) +; SSE-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)  ; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq 16(%rdi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 48(%rdi), %rsi -; SSE-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rsi, %r13 -; SSE-NEXT:    andq %rdx, %r12 -; SSE-NEXT:    orq %r13, %r12 -; SSE-NEXT:    movq %r15, %rsi -; SSE-NEXT:    movq 56(%rdi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rdx, %r15 -; SSE-NEXT:    movq %rbx, %r13 -; SSE-NEXT:    movq 24(%rdi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rdx, %rbx -; SSE-NEXT:    orq %r15, %rbx -; SSE-NEXT:    movq %r14, %rbp -; SSE-NEXT:    movq 32(%rdi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rdx, %r14 -; SSE-NEXT:    movq %r8, %r15 -; SSE-NEXT:    movq (%rdi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rdx, %r8 -; SSE-NEXT:    orq %r14, %r8 -; SSE-NEXT:    orq %r12, %r8 -; SSE-NEXT:    movq %r11, %r12 -; SSE-NEXT:    movq 40(%rdi), %r9 -; SSE-NEXT:    andq %r9, %r11 -; SSE-NEXT:    movq %rax, %r14 -; SSE-NEXT:    movq 8(%rdi), %rdx +; SSE-NEXT:    movq 32(%rsp,%r12), %rax +; SSE-NEXT:    movq 40(%rsp,%r12), %rdx +; SSE-NEXT:    shldq %cl, %rax, %rdx  ; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rdx, %rax -; SSE-NEXT:    orq %r11, %rax -; SSE-NEXT:    orq %rbx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT:    movq 24(%rsp,%r12), %rdx +; SSE-NEXT:    shldq %cl, %rdx, %rax +; SSE-NEXT:    movq 16(%rsp,%r12), %rsi +; SSE-NEXT:    shldq %cl, %rsi, %rdx +; SSE-NEXT:    movq 8(%rsp,%r12), %r8 +; SSE-NEXT:    shldq %cl, %r8, %rsi +; SSE-NEXT:    movq (%rsp,%r12), %rbp +; SSE-NEXT:    shldq %cl, %rbp, %r8 +; SSE-NEXT:    movq -8(%rsp,%r12), %r9 +; SSE-NEXT:    shldq %cl, %r9, %rbp +; SSE-NEXT:    notq %r10 +; SSE-NEXT:    andq 56(%rdi), %r10 +; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; SSE-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE-NEXT:    notq %r10 +; SSE-NEXT:    andq 48(%rdi), %r10 +; SSE-NEXT:    orq %rax, %r10  ; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload  ; SSE-NEXT:    notq %rax -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; SSE-NEXT:    andq 40(%rdi), %rax +; SSE-NEXT:    orq %rdx, %rax  ; SSE-NEXT:    movq %rax, %rdx -; SSE-NEXT:    movq 56(%rsp,%r10), %r11 -; SSE-NEXT:    movq 64(%rsp,%r10), %rax -; SSE-NEXT:    movq %rax, %rbx -; SSE-NEXT:    shldq %cl, %r11, %rbx -; SSE-NEXT:    orq %rbx, %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    notq %rsi -; SSE-NEXT:    movq 72(%rsp,%r10), %rbx -; SSE-NEXT:    shldq %cl, %rax, %rbx -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT:    orq %rbx, %rsi -; SSE-NEXT:    notq %rbp -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT:    movq 40(%rsp,%r10), %rax -; SSE-NEXT:    movq 48(%rsp,%r10), %rdx -; SSE-NEXT:    movq %rdx, %rbx -; SSE-NEXT:    shldq %cl, %rax, %rbx -; SSE-NEXT:    orq %rbx, %rbp -; SSE-NEXT:    notq %r12 -; SSE-NEXT:    andq %r9, %r12 -; SSE-NEXT:    shldq %cl, %rdx, %r11 -; SSE-NEXT:    movq 24(%rsp,%r10), %r9 -; SSE-NEXT:    movq 32(%rsp,%r10), %rdx -; SSE-NEXT:    movq %rdx, %rbx -; SSE-NEXT:    shldq %cl, %r9, %rbx -; SSE-NEXT:    orq %r11, %r12 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload  ; SSE-NEXT:    notq %r11 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    orq %rbx, %r11 -; SSE-NEXT:    notq %r13 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT:    orq %rax, %r13 -; SSE-NEXT:    notq %r15 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT:    movq 16(%rsp,%r10), %rax -; SSE-NEXT:    movq %rax, %rdx -; SSE-NEXT:    shlq %cl, %rdx -; SSE-NEXT:    orq %rdx, %r15 +; SSE-NEXT:    andq 32(%rdi), %r11 +; SSE-NEXT:    orq %rsi, %r11 +; SSE-NEXT:    notq %rbx +; SSE-NEXT:    andq 24(%rdi), %rbx +; SSE-NEXT:    orq %r8, %rbx  ; SSE-NEXT:    notq %r14 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx +; SSE-NEXT:    andq 16(%rdi), %r14 +; SSE-NEXT:    orq %rbp, %r14 +; SSE-NEXT:    notq %r15 +; SSE-NEXT:    movq -16(%rsp,%r12), %rax  ; SSE-NEXT:    shldq %cl, %rax, %r9 -; SSE-NEXT:    orq %r9, %r14 -; SSE-NEXT:    orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; SSE-NEXT:    andq 8(%rdi), %r15 +; SSE-NEXT:    orq %r9, %r15 +; SSE-NEXT:    notq %r13 +; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx +; SSE-NEXT:    shlq %cl, %rax +; SSE-NEXT:    andq (%rdi), %r13 +; SSE-NEXT:    orq %rax, %r13  ; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    movq %rax, 48(%rdi) -; SSE-NEXT:    movq %rsi, 56(%rdi) -; SSE-NEXT:    movq %rbp, 32(%rdi) -; SSE-NEXT:    movq %r12, 40(%rdi) -; SSE-NEXT:    movq %r11, 16(%rdi) -; SSE-NEXT:    movq %r13, 24(%rdi) -; SSE-NEXT:    movq %r15, (%rdi) -; SSE-NEXT:    movq %r14, 8(%rdi) -; SSE-NEXT:    sete %al -; SSE-NEXT:    addq $216, %rsp +; SSE-NEXT:    andl $60, %eax +; SSE-NEXT:    movl (%rdi,%rax), %eax +; SSE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; SSE-NEXT:    btl %ecx, %eax +; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; SSE-NEXT:    movq %rax, 56(%rdi) +; SSE-NEXT:    movq %r10, 48(%rdi) +; SSE-NEXT:    movq %rdx, 40(%rdi) +; SSE-NEXT:    movq %r11, 32(%rdi) +; SSE-NEXT:    movq %rbx, 24(%rdi) +; SSE-NEXT:    movq %r14, 16(%rdi) +; SSE-NEXT:    movq %r15, 8(%rdi) +; SSE-NEXT:    movq %r13, (%rdi) +; SSE-NEXT:    setae %al +; SSE-NEXT:    addq $184, %rsp  ; SSE-NEXT:    popq %rbx  ; SSE-NEXT:    popq %r12  ; SSE-NEXT:    popq %r13 @@ -3978,132 +1413,103 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; AVX2-NEXT:    pushq %r13  ; AVX2-NEXT:    pushq %r12  ; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    subq $200, %rsp +; AVX2-NEXT:    subq $168, %rsp  ; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0  ; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)  ; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)  ; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)  ; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = [1,0,0,0]  ; AVX2-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    movl %esi, %r8d -; AVX2-NEXT:    andl $63, %r8d -; AVX2-NEXT:    shrl $3, %esi -; AVX2-NEXT:    andl $56, %esi -; AVX2-NEXT:    negl %esi -; AVX2-NEXT:    movslq %esi, %rsi -; AVX2-NEXT:    movq 144(%rsp,%rsi), %r11 -; AVX2-NEXT:    movq 152(%rsp,%rsi), %r12 -; AVX2-NEXT:    movq %r12, %r10 -; AVX2-NEXT:    movl %r8d, %ecx -; AVX2-NEXT:    shldq %cl, %r11, %r10 -; AVX2-NEXT:    movq 176(%rsp,%rsi), %r14 -; AVX2-NEXT:    movq 184(%rsp,%rsi), %r9 -; AVX2-NEXT:    shldq %cl, %r14, %r9 -; AVX2-NEXT:    movq 160(%rsp,%rsi), %r15 -; AVX2-NEXT:    movq 168(%rsp,%rsi), %r13 -; AVX2-NEXT:    movq %r13, %rbx -; AVX2-NEXT:    shldq %cl, %r15, %rbx -; AVX2-NEXT:    movq 128(%rsp,%rsi), %rbp -; AVX2-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 136(%rsp,%rsi), %rax -; AVX2-NEXT:    shldq %cl, %rax, %r11 -; AVX2-NEXT:    shldq %cl, %r13, %r14 -; AVX2-NEXT:    shldq %cl, %r12, %r15 -; AVX2-NEXT:    shldq %cl, %rbp, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movl %edx, %edx +; AVX2-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX2-NEXT:    movl %esi, %ecx +; AVX2-NEXT:    andl $63, %ecx +; AVX2-NEXT:    movl %esi, %r11d +; AVX2-NEXT:    shrl $3, %r11d +; AVX2-NEXT:    movl %r11d, %eax +; AVX2-NEXT:    andl $56, %eax +; AVX2-NEXT:    negl %eax +; AVX2-NEXT:    movslq %eax, %r10 +; AVX2-NEXT:    movq 104(%rsp,%r10), %r15 +; AVX2-NEXT:    movq 112(%rsp,%r10), %rax +; AVX2-NEXT:    movq %rax, %rsi +; AVX2-NEXT:    shldq %cl, %r15, %rsi +; AVX2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT:    movq 120(%rsp,%r10), %rsi +; AVX2-NEXT:    movq %rsi, %r8 +; AVX2-NEXT:    shldq %cl, %rax, %r8 +; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT:    movq 128(%rsp,%r10), %rax +; AVX2-NEXT:    movq %rax, %rbx +; AVX2-NEXT:    shldq %cl, %rsi, %rbx +; AVX2-NEXT:    movq 136(%rsp,%r10), %rsi +; AVX2-NEXT:    movq %rsi, %r14 +; AVX2-NEXT:    shldq %cl, %rax, %r14 +; AVX2-NEXT:    movq 144(%rsp,%r10), %rax +; AVX2-NEXT:    movq %rax, %r12 +; AVX2-NEXT:    shldq %cl, %rsi, %r12 +; AVX2-NEXT:    movq 96(%rsp,%r10), %rsi +; AVX2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT:    movq 152(%rsp,%r10), %r13 +; AVX2-NEXT:    shldq %cl, %rax, %r13 +; AVX2-NEXT:    shldq %cl, %rsi, %r15 +; AVX2-NEXT:    movl %edx, %eax  ; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1  ; AVX2-NEXT:    vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)  ; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)  ; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    movq %rdx, (%rsp) +; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) +; AVX2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)  ; AVX2-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    movq 16(%rdi), %r12 -; AVX2-NEXT:    movq 48(%rdi), %rbp -; AVX2-NEXT:    movq 32(%rdi), %r13 -; AVX2-NEXT:    andnq %r13, %r15, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r15, %r13 -; AVX2-NEXT:    andnq %rbp, %r14, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r14, %rbp -; AVX2-NEXT:    andnq %r12, %r11, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r11, %r12 -; AVX2-NEXT:    movq 40(%rdi), %rax +; AVX2-NEXT:    movq 16(%rsp,%r10), %rbp +; AVX2-NEXT:    movq 24(%rsp,%r10), %r9 +; AVX2-NEXT:    shldq %cl, %rbp, %r9 +; AVX2-NEXT:    movq 8(%rsp,%r10), %rdx +; AVX2-NEXT:    shldq %cl, %rdx, %rbp +; AVX2-NEXT:    movq (%rsp,%r10), %rax +; AVX2-NEXT:    shldq %cl, %rax, %rdx +; AVX2-NEXT:    movq -8(%rsp,%r10), %r8 +; AVX2-NEXT:    shldq %cl, %r8, %rax +; AVX2-NEXT:    movq -16(%rsp,%r10), %rsi +; AVX2-NEXT:    shldq %cl, %rsi, %r8 +; AVX2-NEXT:    andnq 56(%rdi), %r13, %r13 +; AVX2-NEXT:    orq %r9, %r13 +; AVX2-NEXT:    movq -24(%rsp,%r10), %r9 +; AVX2-NEXT:    shldq %cl, %r9, %rsi +; AVX2-NEXT:    andnq 48(%rdi), %r12, %r12 +; AVX2-NEXT:    andnq 40(%rdi), %r14, %r14  ; AVX2-NEXT:    orq %rbp, %r12 -; AVX2-NEXT:    andnq %rax, %rbx, %rcx -; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq %rax, %rbp -; AVX2-NEXT:    andq %rbx, %rbp -; AVX2-NEXT:    movq 56(%rdi), %rcx -; AVX2-NEXT:    andnq %rcx, %r9, %rbx -; AVX2-NEXT:    andq %r9, %rcx -; AVX2-NEXT:    movq 24(%rdi), %rax -; AVX2-NEXT:    andnq %rax, %r10, %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r10, %rax -; AVX2-NEXT:    orq %rcx, %rax -; AVX2-NEXT:    shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT:    movq (%rdi), %r10 -; AVX2-NEXT:    andnq %r10, %rcx, %r15 -; AVX2-NEXT:    andq %rcx, %r10 -; AVX2-NEXT:    movq 40(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq 48(%rsp,%rsi), %r11 -; AVX2-NEXT:    movq %r11, %r9 -; AVX2-NEXT:    movl %r8d, %ecx -; AVX2-NEXT:    shldq %cl, %rdx, %r9 -; AVX2-NEXT:    orq %r13, %r10 -; AVX2-NEXT:    orq %r12, %r10 -; AVX2-NEXT:    movq 8(%rdi), %r13 +; AVX2-NEXT:    orq %rdx, %r14 +; AVX2-NEXT:    andnq 32(%rdi), %rbx, %rdx +; AVX2-NEXT:    orq %rax, %rdx +; AVX2-NEXT:    shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX2-NEXT:    movq -32(%rsp,%r10), %r10 +; AVX2-NEXT:    shlxq %rcx, %r10, %rbx +; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx +; AVX2-NEXT:    shldq %cl, %r10, %r9  ; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT:    andnq %r13, %rcx, %r12 -; AVX2-NEXT:    andq %rcx, %r13 -; AVX2-NEXT:    orq %rbp, %r13 -; AVX2-NEXT:    orq %rax, %r13 -; AVX2-NEXT:    movq 56(%rsp,%rsi), %rax -; AVX2-NEXT:    movl %r8d, %ecx -; AVX2-NEXT:    shldq %cl, %r11, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    orq %r9, %r14 -; AVX2-NEXT:    orq %rax, %rbx -; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 24(%rsp,%rsi), %rax -; AVX2-NEXT:    movq 32(%rsp,%rsi), %r9 -; AVX2-NEXT:    movq %r9, %r11 -; AVX2-NEXT:    shldq %cl, %rax, %r11 -; AVX2-NEXT:    shldq %cl, %r9, %rdx -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT:    orq %r11, %rbp -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    orq %rdx, %rbx -; AVX2-NEXT:    movq 8(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq 16(%rsp,%rsi), %r9 -; AVX2-NEXT:    movq %r9, %r11 -; AVX2-NEXT:    shldq %cl, %rdx, %r11 -; AVX2-NEXT:    shldq %cl, %r9, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT:    orq %r11, %r9 -; AVX2-NEXT:    movq (%rsp,%rsi), %rsi -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT:    orq %rax, %r11 -; AVX2-NEXT:    shlxq %r8, %rsi, %rax -; AVX2-NEXT:    shldq %cl, %rsi, %rdx -; AVX2-NEXT:    orq %rax, %r15 -; AVX2-NEXT:    orq %rdx, %r12 -; AVX2-NEXT:    orq %r10, %r13 -; AVX2-NEXT:    movq %r14, 48(%rdi) -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    movq %rax, 56(%rdi) -; AVX2-NEXT:    movq %rbp, 32(%rdi) -; AVX2-NEXT:    movq %rbx, 40(%rdi) -; AVX2-NEXT:    movq %r9, 16(%rdi) -; AVX2-NEXT:    movq %r11, 24(%rdi) -; AVX2-NEXT:    movq %r15, (%rdi) -; AVX2-NEXT:    movq %r12, 8(%rdi) -; AVX2-NEXT:    sete %al -; AVX2-NEXT:    addq $200, %rsp +; AVX2-NEXT:    andnq 24(%rdi), %rcx, %rcx +; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX2-NEXT:    andnq 16(%rdi), %r10, %r10 +; AVX2-NEXT:    orq %r8, %rcx +; AVX2-NEXT:    orq %rsi, %r10 +; AVX2-NEXT:    andnq 8(%rdi), %r15, %rsi +; AVX2-NEXT:    orq %r9, %rsi +; AVX2-NEXT:    andnq (%rdi), %rax, %rax +; AVX2-NEXT:    orq %rbx, %rax +; AVX2-NEXT:    andl $60, %r11d +; AVX2-NEXT:    movl (%rdi,%r11), %r8d +; AVX2-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload +; AVX2-NEXT:    btl %r9d, %r8d +; AVX2-NEXT:    movq %r13, 56(%rdi) +; AVX2-NEXT:    movq %r12, 48(%rdi) +; AVX2-NEXT:    movq %r14, 40(%rdi) +; AVX2-NEXT:    movq %rdx, 32(%rdi) +; AVX2-NEXT:    movq %rcx, 24(%rdi) +; AVX2-NEXT:    movq %r10, 16(%rdi) +; AVX2-NEXT:    movq %rsi, 8(%rdi) +; AVX2-NEXT:    movq %rax, (%rdi) +; AVX2-NEXT:    setae %al +; AVX2-NEXT:    addq $168, %rsp  ; AVX2-NEXT:    popq %rbx  ; AVX2-NEXT:    popq %r12  ; AVX2-NEXT:    popq %r13 @@ -4121,131 +1527,100 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; AVX512-NEXT:    pushq %r13  ; AVX512-NEXT:    pushq %r12  ; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    subq $184, %rsp +; AVX512-NEXT:    subq $152, %rsp  ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0  ; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)  ; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)  ; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)  ; AVX512-NEXT:    vmovaps {{.*#+}} xmm1 = [1,0,0,0]  ; AVX512-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) +; AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill  ; AVX512-NEXT:    movl %esi, %ecx  ; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %esi -; AVX512-NEXT:    andl $56, %esi -; AVX512-NEXT:    negl %esi -; AVX512-NEXT:    movslq %esi, %rsi -; AVX512-NEXT:    movq 128(%rsp,%rsi), %r10 -; AVX512-NEXT:    movq 136(%rsp,%rsi), %r12 -; AVX512-NEXT:    movq %r12, %rax -; AVX512-NEXT:    shldq %cl, %r10, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 160(%rsp,%rsi), %r14 -; AVX512-NEXT:    movq 168(%rsp,%rsi), %rax -; AVX512-NEXT:    shldq %cl, %r14, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 144(%rsp,%rsi), %r15 -; AVX512-NEXT:    movq 152(%rsp,%rsi), %r11 -; AVX512-NEXT:    movq %r11, %rbx -; AVX512-NEXT:    shldq %cl, %r15, %rbx -; AVX512-NEXT:    movq 120(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rax, %r10 -; AVX512-NEXT:    shldq %cl, %r11, %r14 -; AVX512-NEXT:    movq %rdi, %r9 -; AVX512-NEXT:    movq 112(%rsp,%rsi), %r11 -; AVX512-NEXT:    shldq %cl, %r12, %r15 -; AVX512-NEXT:    movl %edx, %edx +; AVX512-NEXT:    movl %esi, %r8d +; AVX512-NEXT:    shrl $3, %r8d +; AVX512-NEXT:    movl %r8d, %eax +; AVX512-NEXT:    andl $56, %eax +; AVX512-NEXT:    negl %eax +; AVX512-NEXT:    movslq %eax, %r9 +; AVX512-NEXT:    movq 88(%rsp,%r9), %r10 +; AVX512-NEXT:    movq 96(%rsp,%r9), %rax +; AVX512-NEXT:    movq %rax, %rsi +; AVX512-NEXT:    shldq %cl, %r10, %rsi +; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT:    movq 104(%rsp,%r9), %rsi +; AVX512-NEXT:    movq %rsi, %r11 +; AVX512-NEXT:    shldq %cl, %rax, %r11 +; AVX512-NEXT:    movq 112(%rsp,%r9), %rax +; AVX512-NEXT:    movq %rax, %rbx +; AVX512-NEXT:    shldq %cl, %rsi, %rbx +; AVX512-NEXT:    movq 120(%rsp,%r9), %rsi +; AVX512-NEXT:    movq %rsi, %r14 +; AVX512-NEXT:    shldq %cl, %rax, %r14 +; AVX512-NEXT:    movq 128(%rsp,%r9), %rax +; AVX512-NEXT:    movq %rax, %r12 +; AVX512-NEXT:    shldq %cl, %rsi, %r12 +; AVX512-NEXT:    movq 136(%rsp,%r9), %r13 +; AVX512-NEXT:    shldq %cl, %rax, %r13 +; AVX512-NEXT:    movq 80(%rsp,%r9), %r15 +; AVX512-NEXT:    shldq %cl, %r15, %r10 +; AVX512-NEXT:    movl %edx, %eax  ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT:    vmovups %xmm1, {{[0-9]+}}(%rsp) +; AVX512-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)  ; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)  ; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)  ; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp) +; AVX512-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)  ; AVX512-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    movq 16(%rdi), %r12 -; AVX512-NEXT:    movq 48(%rdi), %r13 -; AVX512-NEXT:    movq 32(%rdi), %rbp -; AVX512-NEXT:    andnq %rbp, %r15, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r15, %rbp -; AVX512-NEXT:    andnq %r13, %r14, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r14, %r13 -; AVX512-NEXT:    andnq %r12, %r10, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r10, %r12 -; AVX512-NEXT:    movq 40(%rdi), %r8 -; AVX512-NEXT:    orq %r13, %r12 -; AVX512-NEXT:    andnq %r8, %rbx, %rdi -; AVX512-NEXT:    andq %rbx, %r8 -; AVX512-NEXT:    movq 56(%r9), %r13 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT:    andnq %r13, %rdx, %r10 -; AVX512-NEXT:    andq %rdx, %r13 -; AVX512-NEXT:    movq 24(%r9), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT:    andnq %rax, %rdx, %r15 -; AVX512-NEXT:    andq %rdx, %rax -; AVX512-NEXT:    orq %r13, %rax -; AVX512-NEXT:    shlxq %rcx, %r11, %r13 -; AVX512-NEXT:    movq (%r9), %rdx -; AVX512-NEXT:    andnq %rdx, %r13, %r14 -; AVX512-NEXT:    andq %r13, %rdx -; AVX512-NEXT:    orq %rbp, %rdx -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r11, %rbp -; AVX512-NEXT:    orq %r12, %rdx -; AVX512-NEXT:    movq 8(%r9), %r13 -; AVX512-NEXT:    andnq %r13, %rbp, %rbx -; AVX512-NEXT:    andq %rbp, %r13 -; AVX512-NEXT:    orq %r8, %r13 -; AVX512-NEXT:    movq 24(%rsp,%rsi), %r8 -; AVX512-NEXT:    orq %rax, %r13 -; AVX512-NEXT:    movq 32(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, %r12 -; AVX512-NEXT:    shldq %cl, %r8, %r12 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT:    orq %r12, %r11 -; AVX512-NEXT:    movq 40(%rsp,%rsi), %r12 -; AVX512-NEXT:    shldq %cl, %rax, %r12 -; AVX512-NEXT:    orq %r12, %r10 -; AVX512-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 8(%rsp,%rsi), %rax -; AVX512-NEXT:    movq 16(%rsp,%rsi), %r12 -; AVX512-NEXT:    movq %r12, %rbp -; AVX512-NEXT:    shldq %cl, %rax, %rbp -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    orq %rbp, %r10 -; AVX512-NEXT:    shldq %cl, %r12, %r8 -; AVX512-NEXT:    orq %r8, %rdi -; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq -8(%rsp,%rsi), %r8 -; AVX512-NEXT:    movq (%rsp,%rsi), %r12 -; AVX512-NEXT:    movq %r12, %rbp -; AVX512-NEXT:    shldq %cl, %r8, %rbp -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT:    orq %rbp, %rdi -; AVX512-NEXT:    movq -16(%rsp,%rsi), %rsi -; AVX512-NEXT:    shldq %cl, %r12, %rax +; AVX512-NEXT:    movq (%rsp,%r9), %rbp +; AVX512-NEXT:    movq 8(%rsp,%r9), %rsi +; AVX512-NEXT:    shldq %cl, %rbp, %rsi +; AVX512-NEXT:    movq -8(%rsp,%r9), %rdx +; AVX512-NEXT:    shldq %cl, %rdx, %rbp +; AVX512-NEXT:    movq -16(%rsp,%r9), %rax +; AVX512-NEXT:    shldq %cl, %rax, %rdx +; AVX512-NEXT:    andnq 56(%rdi), %r13, %r13 +; AVX512-NEXT:    andnq 48(%rdi), %r12, %r12 +; AVX512-NEXT:    orq %rsi, %r13 +; AVX512-NEXT:    orq %rbp, %r12 +; AVX512-NEXT:    andnq 40(%rdi), %r14, %r14 +; AVX512-NEXT:    orq %rdx, %r14 +; AVX512-NEXT:    movq -24(%rsp,%r9), %rsi +; AVX512-NEXT:    shldq %cl, %rsi, %rax +; AVX512-NEXT:    andnq 32(%rdi), %rbx, %rdx +; AVX512-NEXT:    orq %rax, %rdx +; AVX512-NEXT:    movq -32(%rsp,%r9), %rax +; AVX512-NEXT:    shldq %cl, %rax, %rsi +; AVX512-NEXT:    shlxq %rcx, %r15, %rbx +; AVX512-NEXT:    andnq 24(%rdi), %r11, %r11 +; AVX512-NEXT:    orq %rsi, %r11 +; AVX512-NEXT:    movq -48(%rsp,%r9), %rsi +; AVX512-NEXT:    movq -40(%rsp,%r9), %r9 +; AVX512-NEXT:    shldq %cl, %r9, %rax +; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT:    andnq 16(%rdi), %r15, %r15  ; AVX512-NEXT:    orq %rax, %r15  ; AVX512-NEXT:    shlxq %rcx, %rsi, %rax  ; AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT:    shldq %cl, %rsi, %r8 -; AVX512-NEXT:    orq %rax, %r14 -; AVX512-NEXT:    orq %r8, %rbx -; AVX512-NEXT:    orq %rdx, %r13 -; AVX512-NEXT:    movq %r11, 48(%r9) -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    movq %rax, 56(%r9) -; AVX512-NEXT:    movq %r10, 32(%r9) -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    movq %rax, 40(%r9) -; AVX512-NEXT:    movq %rdi, 16(%r9) -; AVX512-NEXT:    movq %r15, 24(%r9) -; AVX512-NEXT:    movq %r14, (%r9) -; AVX512-NEXT:    movq %rbx, 8(%r9) -; AVX512-NEXT:    sete %al -; AVX512-NEXT:    addq $184, %rsp +; AVX512-NEXT:    shldq %cl, %rsi, %r9 +; AVX512-NEXT:    andnq 8(%rdi), %r10, %rcx +; AVX512-NEXT:    orq %r9, %rcx +; AVX512-NEXT:    andnq (%rdi), %rbx, %rsi +; AVX512-NEXT:    orq %rax, %rsi +; AVX512-NEXT:    andl $60, %r8d +; AVX512-NEXT:    movl (%rdi,%r8), %eax +; AVX512-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 4-byte Reload +; AVX512-NEXT:    btl %r8d, %eax +; AVX512-NEXT:    movq %r13, 56(%rdi) +; AVX512-NEXT:    movq %r12, 48(%rdi) +; AVX512-NEXT:    movq %r14, 40(%rdi) +; AVX512-NEXT:    movq %rdx, 32(%rdi) +; AVX512-NEXT:    movq %r11, 24(%rdi) +; AVX512-NEXT:    movq %r15, 16(%rdi) +; AVX512-NEXT:    movq %rcx, 8(%rdi) +; AVX512-NEXT:    movq %rsi, (%rdi) +; AVX512-NEXT:    setae %al +; AVX512-NEXT:    addq $152, %rsp  ; AVX512-NEXT:    popq %rbx  ; AVX512-NEXT:    popq %r12  ; AVX512-NEXT:    popq %r13 @@ -4274,144 +1649,48 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {  define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {  ; X86-LABEL: test_ne_i4096:  ; X86:       # %bb.0: +; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl %ecx, %edx +; X86-NEXT:    andl $4064, %edx # imm = 0xFE0 +; X86-NEXT:    shrl $3, %edx +; X86-NEXT:    movl (%eax,%edx), %eax +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    setb %al +; X86-NEXT:    retl +; +; X64-LABEL: test_ne_i4096: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    andl $4064, %eax # imm = 0xFE0 +; X64-NEXT:    shrl $3, %eax +; X64-NEXT:    movl (%rdi,%rax), %eax +; X64-NEXT:    btl %esi, %eax +; X64-NEXT:    setb %al +; X64-NEXT:    retq +  %rem = and i32 %position, 4095 +  %ofs = zext nneg i32 %rem to i4096 +  %bit = shl nuw i4096 1, %ofs +  %ld = load i4096, ptr %word +  %test = and i4096 %ld, %bit +  %cmp = icmp ne i4096 %test, 0 +  ret i1 %cmp +} + +; Special Cases + +; Multiple uses of the stored value +define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { +; X86-LABEL: complement_cmpz_i128: +; X86:       # %bb.0:  ; X86-NEXT:    pushl %ebp  ; X86-NEXT:    movl %esp, %ebp  ; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi  ; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $1792, %esp # imm = 0x700 -; X86-NEXT:    movl 12(%ebp), %ebx -; X86-NEXT:    movl %ebx, %ecx -; X86-NEXT:    shrl $3, %ecx -; X86-NEXT:    andl $508, %ecx # imm = 0x1FC -; X86-NEXT:    leal {{[0-9]+}}(%esp), %esi -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    subl %ecx, %esi -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    subl $64, %esp +; X86-NEXT:    movzbl 12(%ebp), %ecx  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) @@ -4420,1061 +1699,35 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 248(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 252(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl $31, %ebx -; X86-NEXT:    movl %ebx, %ecx -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 504(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 508(%esi), %edx -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 120(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 124(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 376(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 380(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 184(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 188(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 440(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 444(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 312(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 316(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 216(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 220(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 472(%esi), %edi -; X86-NEXT:    movl 476(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 88(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 92(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 344(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 348(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 152(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 156(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 408(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 412(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 24(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 280(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 284(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 232(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 236(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 488(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 492(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 104(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 108(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 360(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 364(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 168(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 172(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 424(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 428(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 296(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 300(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 200(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 204(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 456(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 460(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 72(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 76(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 328(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 332(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 136(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 140(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 392(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 396(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 264(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 268(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 240(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 244(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 496(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 500(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 112(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 116(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 368(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 372(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 176(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 180(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 432(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 436(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 52(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 304(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 308(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 208(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 212(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 464(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 468(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 80(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 84(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 336(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 340(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 144(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 148(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 400(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 404(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 272(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 276(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 224(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 228(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 480(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 484(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 96(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 100(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 352(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 356(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 160(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 164(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 416(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 420(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 288(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 292(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 192(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 196(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 448(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 452(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 64(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 68(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 320(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 324(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 128(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 132(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl %ecx, %eax +; X86-NEXT:    shrb $3, %al +; X86-NEXT:    andb $12, %al +; X86-NEXT:    negb %al +; X86-NEXT:    movsbl %al, %esi +; X86-NEXT:    movl 36(%esp,%esi), %eax +; X86-NEXT:    movl 40(%esp,%esi), %edi  ; X86-NEXT:    movl %edi, %edx -; X86-NEXT:    movl 256(%esi), %edi -; X86-NEXT:    movl 260(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload  ; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 388(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 4(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shrdl $1, %eax, %edi -; X86-NEXT:    shrl %eax -; X86-NEXT:    movl %ebx, %edx -; X86-NEXT:    movl %eax, %ebx -; X86-NEXT:    notb %cl -; X86-NEXT:    shrdl %cl, %eax, %edi -; X86-NEXT:    shrl %cl, %ebx -; X86-NEXT:    movb $32, %cl -; X86-NEXT:    testb %cl, %cl -; X86-NEXT:    movl (%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    jne .LBB20_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %edi, %ebx -; X86-NEXT:  .LBB20_2: -; X86-NEXT:    movl %edx, %ecx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shll %cl, %edx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 320(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 64(%eax), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 448(%eax), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 192(%eax), %ecx -; X86-NEXT:    orl %edx, %ecx -; X86-NEXT:    orl %esi, %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 288(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 32(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 416(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 160(%eax), %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 352(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 96(%eax), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 480(%eax), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 224(%eax), %ecx -; X86-NEXT:    orl %edx, %ecx -; X86-NEXT:    orl %esi, %ecx -; X86-NEXT:    orl %edi, %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 272(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 16(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 400(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 144(%eax), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 336(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 80(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 464(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 208(%eax), %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 304(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 48(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 432(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 176(%eax), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 368(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 112(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 496(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    andl 240(%eax), %ebx -; X86-NEXT:    orl %ecx, %ebx -; X86-NEXT:    orl %edx, %ebx -; X86-NEXT:    orl %esi, %ebx -; X86-NEXT:    orl %edi, %ebx +; X86-NEXT:    movl 32(%esp,%esi), %ebx  ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 264(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 8(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl %eax, %ebx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 392(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 136(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    orl %edx, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 328(%ebx), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 72(%ebx), %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 456(%ebx), %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 200(%ebx), %esi -; X86-NEXT:    orl %edi, %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 296(%ebx), %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 40(%ebx), %eax -; X86-NEXT:    orl %edi, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 424(%ebx), %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 168(%ebx), %edx -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 360(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 104(%ebx), %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 488(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 232(%ebx), %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 280(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 24(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 408(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 152(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 344(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 88(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 472(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 216(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 312(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 56(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 440(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 184(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 376(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 120(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 504(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 248(%ebx), %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 324(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 68(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 452(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 196(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 292(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 36(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 420(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 164(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 356(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 100(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 484(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 228(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 276(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 20(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 404(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 148(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 340(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 84(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 468(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 212(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 308(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 52(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 436(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 180(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 372(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 116(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 500(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 244(%ebx), %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 268(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 12(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 396(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 140(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 332(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 76(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 460(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 204(%ebx), %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 300(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 44(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 428(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 172(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 364(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 108(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 492(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 236(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    orl %edi, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 284(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 28(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 412(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 156(%ebx), %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 348(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 92(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 476(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 220(%ebx), %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    orl %edi, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 316(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 60(%ebx), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 444(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 188(%ebx), %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 380(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 124(%ebx), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 508(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    andl 252(%esi), %ebx -; X86-NEXT:    orl %ecx, %ebx -; X86-NEXT:    orl %edx, %ebx -; X86-NEXT:    orl %edi, %ebx -; X86-NEXT:    orl %eax, %ebx -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    negl %ecx -; X86-NEXT:    movl 1648(%esp,%ecx), %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT:    movl 44(%esp,%esi), %esi  ; X86-NEXT:    shldl %cl, %edi, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl 8(%ebp), %edx -; X86-NEXT:    andl 128(%edx), %ecx -; X86-NEXT:    andl 384(%edx), %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    andl (%edx), %eax -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 256(%edx), %eax +; X86-NEXT:    movl %ebx, %edi +; X86-NEXT:    shll %cl, %edi +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    shldl %cl, %ebx, %eax +; X86-NEXT:    movl 8(%ebp), %ecx +; X86-NEXT:    xorl 12(%ecx), %esi +; X86-NEXT:    xorl 8(%ecx), %edx +; X86-NEXT:    xorl 4(%ecx), %eax +; X86-NEXT:    xorl (%ecx), %edi +; X86-NEXT:    movl %edx, 8(%ecx) +; X86-NEXT:    movl %esi, 12(%ecx) +; X86-NEXT:    movl %edi, (%ecx) +; X86-NEXT:    movl %eax, 4(%ecx) +; X86-NEXT:    orl %esi, %eax +; X86-NEXT:    orl %edx, %edi  ; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 260(%edx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 4(%edx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 132(%edx), %eax -; X86-NEXT:    andl 388(%edx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    orl %ebx, %esi -; X86-NEXT:    orl %edi, %esi  ; X86-NEXT:    setne %al  ; X86-NEXT:    leal -12(%ebp), %esp  ; X86-NEXT:    popl %esi @@ -5483,1545 +1736,231 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {  ; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: test_ne_i4096: +; SSE-LABEL: complement_cmpz_i128:  ; SSE:       # %bb.0: -; SSE-NEXT:    pushq %rbp -; SSE-NEXT:    pushq %r15 -; SSE-NEXT:    pushq %r14 -; SSE-NEXT:    pushq %r13 -; SSE-NEXT:    pushq %r12 -; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    subq $1576, %rsp # imm = 0x628  ; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl %esi, %eax -; SSE-NEXT:    andl $4032, %eax # imm = 0xFC0 -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT:    andl $63, %ecx -; SSE-NEXT:    shrl $3, %eax -; SSE-NEXT:    negl %eax -; SSE-NEXT:    movslq %eax, %rsi -; SSE-NEXT:    movq 1296(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1304(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1552(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1560(%rsp,%rsi), %rax -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1168(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1176(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1424(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1432(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1232(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1240(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1488(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1496(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1104(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1112(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1360(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, (%rsp) # 8-byte Spill -; SSE-NEXT:    movq 1368(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1264(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1272(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1520(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1528(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1136(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1144(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1392(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1400(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1200(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1208(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1456(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1464(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1072(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1080(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1328(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1336(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1280(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1288(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1536(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1544(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1152(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1160(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1408(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1416(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1216(%rsp,%rsi), %r11 -; SSE-NEXT:    movq 1224(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %r11, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1472(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1480(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1088(%rsp,%rsi), %r9 -; SSE-NEXT:    movq 1096(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %r9, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1344(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1352(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1248(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1256(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rax, %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1504(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1512(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1120(%rsp,%rsi), %rax -; SSE-NEXT:    movq 1128(%rsp,%rsi), %r8 -; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rax, %r8 -; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1376(%rsp,%rsi), %r13 -; SSE-NEXT:    movq 1384(%rsp,%rsi), %rbx -; SSE-NEXT:    movq %rbx, %r8 -; SSE-NEXT:    shldq %cl, %r13, %r8 -; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1184(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1192(%rsp,%rsi), %r15 -; SSE-NEXT:    movq %r15, %r14 -; SSE-NEXT:    shldq %cl, %rdx, %r14 -; SSE-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1440(%rsp,%rsi), %r10 -; SSE-NEXT:    movq 1448(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, %r14 -; SSE-NEXT:    shldq %cl, %r10, %r14 -; SSE-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1312(%rsp,%rsi), %r14 -; SSE-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1320(%rsp,%rsi), %rbp -; SSE-NEXT:    movq %rbp, %r12 -; SSE-NEXT:    shldq %cl, %r14, %r12 -; SSE-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, (%rsp) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq 1064(%rsp,%rsi), %rbx -; SSE-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %rbp, %r14 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    shldq %cl, %rdx, %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %r9 -; SSE-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %rbp -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %r9 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %r13 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r12, %r15 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r12, %r10 -; SSE-NEXT:    andq 384(%rdi), %r10 -; SSE-NEXT:    andq 128(%rdi), %r15 -; SSE-NEXT:    andq 320(%rdi), %r13 -; SSE-NEXT:    andq 64(%rdi), %rax -; SSE-NEXT:    orq %r10, %r15 -; SSE-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    orq %r13, %rax -; SSE-NEXT:    andq 448(%rdi), %r9 -; SSE-NEXT:    andq 192(%rdi), %rbp -; SSE-NEXT:    orq %r9, %rbp -; SSE-NEXT:    orq %rax, %rbp -; SSE-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq 288(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    andq 32(%rdi), %r9 -; SSE-NEXT:    andq 416(%rdi), %rdx -; SSE-NEXT:    andq 160(%rdi), %r11 -; SSE-NEXT:    orq %r8, %r9 -; SSE-NEXT:    orq %rdx, %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 352(%rdi), %rdx -; SSE-NEXT:    orq %r9, %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 96(%rdi), %rax -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    movq %rax, %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 480(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 224(%rdi), %r8 -; SSE-NEXT:    orq %rax, %r8 -; SSE-NEXT:    orq %rdx, %r8 -; SSE-NEXT:    andq 272(%rdi), %r14 -; SSE-NEXT:    orq %r11, %r8 -; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 16(%rdi), %rax -; SSE-NEXT:    orq %r14, %rax -; SSE-NEXT:    movq %rax, %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 400(%rdi), %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 144(%rdi), %rax -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    orq %r8, %rax -; SSE-NEXT:    movq %rax, %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    andq 336(%rdi), %r9 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 80(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 464(%rdi), %rdx -; SSE-NEXT:    orq %r9, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT:    andq 208(%rdi), %r11 -; SSE-NEXT:    orq %rdx, %r11 -; SSE-NEXT:    orq %rax, %r11 -; SSE-NEXT:    orq %r8, %r11 -; SSE-NEXT:    movq (%rsp), %rdx # 8-byte Reload -; SSE-NEXT:    andq 304(%rdi), %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 48(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    andq 432(%rdi), %r9 -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    movq %rax, %r10 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 176(%rdi), %r8 -; SSE-NEXT:    orq %r9, %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    andq 368(%rdi), %r9 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 112(%rdi), %rax -; SSE-NEXT:    orq %r10, %r8 -; SSE-NEXT:    movq %r8, %r10 -; SSE-NEXT:    orq %r9, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 496(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT:    andq 240(%rdi), %rbp -; SSE-NEXT:    orq %r8, %rbp -; SSE-NEXT:    orq %rax, %rbp -; SSE-NEXT:    orq %r10, %rbp -; SSE-NEXT:    orq %r11, %rbp -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 392(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT:    andq 136(%rdi), %r12 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 328(%rdi), %rdx -; SSE-NEXT:    orq %rax, %r12 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 72(%rdi), %rax -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    movq %rax, %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 456(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE-NEXT:    andq 200(%rdi), %r13 -; SSE-NEXT:    orq %rax, %r13 -; SSE-NEXT:    orq %rdx, %r13 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 296(%rdi), %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 40(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 424(%rdi), %r8 -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    movq %rax, %r9 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 168(%rdi), %rdx -; SSE-NEXT:    orq %r8, %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 360(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 104(%rdi), %rax -; SSE-NEXT:    orq %r9, %rdx -; SSE-NEXT:    orq %r8, %rax -; SSE-NEXT:    movq %rax, %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 488(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    andq 232(%rdi), %r15 -; SSE-NEXT:    orq %rax, %r15 -; SSE-NEXT:    orq %r8, %r15 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 280(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 24(%rdi), %rax -; SSE-NEXT:    orq %rdx, %r15 -; SSE-NEXT:    orq %r8, %rax -; SSE-NEXT:    movq %rax, %r10 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 408(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 152(%rdi), %rax -; SSE-NEXT:    orq %r8, %rax -; SSE-NEXT:    orq %r10, %rax -; SSE-NEXT:    movq %rax, %r10 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT:    andq 344(%rdi), %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 88(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 472(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE-NEXT:    andq 216(%rdi), %r14 -; SSE-NEXT:    orq %r11, %r8 -; SSE-NEXT:    orq %rax, %r14 -; SSE-NEXT:    orq %r8, %r14 -; SSE-NEXT:    orq %r10, %r14 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT:    andq 312(%rdi), %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT:    andq 56(%rdi), %r10 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 440(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    andq 184(%rdi), %r9 -; SSE-NEXT:    orq %r11, %r10 -; SSE-NEXT:    orq %r8, %r9 -; SSE-NEXT:    orq %r10, %r9 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; SSE-NEXT:    movl $1, %eax +; SSE-NEXT:    xorl %edx, %edx  ; SSE-NEXT:    shldq %cl, %rax, %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT:    andq 376(%rdi), %r10 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 120(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT:    andq 504(%rdi), %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 248(%rdi), %r8 -; SSE-NEXT:    orq %r10, %rax -; SSE-NEXT:    movq %rax, %r10 -; SSE-NEXT:    orq %r11, %r8 -; SSE-NEXT:    movq 1056(%rsp,%rsi), %rax -; SSE-NEXT:    shldq %cl, %rax, %rbx -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx  ; SSE-NEXT:    shlq %cl, %rax -; SSE-NEXT:    orq %r10, %r8 -; SSE-NEXT:    orq %r9, %r8 -; SSE-NEXT:    andq 256(%rdi), %rdx -; SSE-NEXT:    orq %r14, %r8 -; SSE-NEXT:    andq (%rdi), %rax +; SSE-NEXT:    xorl %esi, %esi +; SSE-NEXT:    testb $64, %cl +; SSE-NEXT:    cmovneq %rax, %rdx +; SSE-NEXT:    cmovneq %rsi, %rax +; SSE-NEXT:    xorq 8(%rdi), %rdx +; SSE-NEXT:    xorq (%rdi), %rax +; SSE-NEXT:    movq %rax, (%rdi) +; SSE-NEXT:    movq %rdx, 8(%rdi)  ; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT:    orq %rbp, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT:    andq 264(%rdi), %rcx -; SSE-NEXT:    andq 8(%rdi), %rbx -; SSE-NEXT:    orq %rcx, %rbx -; SSE-NEXT:    orq %r12, %rbx -; SSE-NEXT:    orq %r13, %rbx -; SSE-NEXT:    orq %r15, %rbx -; SSE-NEXT:    orq %r8, %rbx -; SSE-NEXT:    orq %rax, %rbx  ; SSE-NEXT:    setne %al -; SSE-NEXT:    addq $1576, %rsp # imm = 0x628 -; SSE-NEXT:    popq %rbx -; SSE-NEXT:    popq %r12 -; SSE-NEXT:    popq %r13 -; SSE-NEXT:    popq %r14 -; SSE-NEXT:    popq %r15 -; SSE-NEXT:    popq %rbp  ; SSE-NEXT:    retq  ; -; AVX2-LABEL: test_ne_i4096: +; AVX2-LABEL: complement_cmpz_i128:  ; AVX2:       # %bb.0: -; AVX2-NEXT:    pushq %rbp -; AVX2-NEXT:    pushq %r15 -; AVX2-NEXT:    pushq %r14 -; AVX2-NEXT:    pushq %r13 -; AVX2-NEXT:    pushq %r12 -; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    subq $1560, %rsp # imm = 0x618  ; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    movl %esi, %eax -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    andl $4032, %eax # imm = 0xFC0 -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    andl $63, %ecx -; AVX2-NEXT:    shrl $3, %eax -; AVX2-NEXT:    negl %eax -; AVX2-NEXT:    movslq %eax, %rsi -; AVX2-NEXT:    movq 1280(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1288(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1536(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1544(%rsp,%rsi), %rax -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1152(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1160(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1408(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1416(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1216(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, (%rsp) # 8-byte Spill -; AVX2-NEXT:    movq 1224(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1472(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1480(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1088(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1096(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1344(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1352(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1248(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1256(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1504(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1512(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1120(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1128(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1376(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1384(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1184(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1192(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1440(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1448(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1056(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1064(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1312(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1320(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1264(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1272(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1520(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1528(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1136(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1144(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1392(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1400(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1200(%rsp,%rsi), %r11 -; AVX2-NEXT:    movq 1208(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %r11, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1456(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1464(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1072(%rsp,%rsi), %r12 -; AVX2-NEXT:    movq 1080(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %r12, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1328(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1336(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1232(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1240(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rax, %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1488(%rsp,%rsi), %rbp -; AVX2-NEXT:    movq 1496(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rbp, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1104(%rsp,%rsi), %rax -; AVX2-NEXT:    movq 1112(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT:    movl $1, %eax +; AVX2-NEXT:    xorl %edx, %edx  ; AVX2-NEXT:    shldq %cl, %rax, %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1360(%rsp,%rsi), %r10 -; AVX2-NEXT:    movq 1368(%rsp,%rsi), %r8 -; AVX2-NEXT:    movq %r8, %rdx -; AVX2-NEXT:    shldq %cl, %r10, %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1168(%rsp,%rsi), %r9 -; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1176(%rsp,%rsi), %rbx -; AVX2-NEXT:    movq %rbx, %rdx -; AVX2-NEXT:    shldq %cl, %r9, %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1424(%rsp,%rsi), %r9 -; AVX2-NEXT:    movq 1432(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, %r14 -; AVX2-NEXT:    shldq %cl, %r9, %r14 -; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1296(%rsp,%rsi), %r15 -; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1304(%rsp,%rsi), %r14 -; AVX2-NEXT:    movq %r14, %r13 -; AVX2-NEXT:    shldq %cl, %r15, %r13 -; AVX2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, (%rsp) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq 1048(%rsp,%rsi), %rdx -; AVX2-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %rbx -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %r11 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %r12 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %r13 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %rbp -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, %r14 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, %r9 -; AVX2-NEXT:    andq 384(%rdi), %r9 -; AVX2-NEXT:    andq 128(%rdi), %r14 -; AVX2-NEXT:    andq 320(%rdi), %r10 -; AVX2-NEXT:    orq %r9, %r14 -; AVX2-NEXT:    movq %r14, %r15 -; AVX2-NEXT:    andq 64(%rdi), %rax -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    andq 448(%rdi), %rbp -; AVX2-NEXT:    andq 192(%rdi), %r13 -; AVX2-NEXT:    orq %rbp, %r13 -; AVX2-NEXT:    orq %rax, %r13 -; AVX2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq 288(%rdi), %r8 -; AVX2-NEXT:    andq 32(%rdi), %r12 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 416(%rdi), %rax -; AVX2-NEXT:    orq %r8, %r12 -; AVX2-NEXT:    andq 160(%rdi), %r11 -; AVX2-NEXT:    orq %rax, %r11 -; AVX2-NEXT:    andq 352(%rdi), %rbx -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 96(%rdi), %rax -; AVX2-NEXT:    orq %r12, %r11 -; AVX2-NEXT:    orq %rbx, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 480(%rdi), %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT:    andq 224(%rdi), %r13 -; AVX2-NEXT:    orq %r10, %r13 -; AVX2-NEXT:    orq %rax, %r13 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 272(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 16(%rdi), %rax -; AVX2-NEXT:    orq %r11, %r13 -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT:    andq 400(%rdi), %r9 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 144(%rdi), %rax -; AVX2-NEXT:    orq %r9, %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r9 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 336(%rdi), %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 80(%rdi), %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 464(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT:    andq 208(%rdi), %r11 -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    orq %r8, %r11 -; AVX2-NEXT:    orq %rax, %r11 -; AVX2-NEXT:    orq %r9, %r11 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT:    andq 304(%rdi), %r9 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 48(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 432(%rdi), %r10 -; AVX2-NEXT:    movq (%rsp), %rax # 8-byte Reload -; AVX2-NEXT:    andq 176(%rdi), %rax -; AVX2-NEXT:    orq %r9, %r8 -; AVX2-NEXT:    movq %r8, %r9 -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 368(%rdi), %r8 -; AVX2-NEXT:    orq %r9, %rax -; AVX2-NEXT:    movq %rax, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 112(%rdi), %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 496(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT:    andq 240(%rdi), %r9 -; AVX2-NEXT:    orq %r8, %r9 -; AVX2-NEXT:    orq %rax, %r9 -; AVX2-NEXT:    orq %r10, %r9 -; AVX2-NEXT:    orq %r11, %r9 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 392(%rdi), %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT:    andq 136(%rdi), %rbp -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 328(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 72(%rdi), %rax -; AVX2-NEXT:    orq %r10, %rbp -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 456(%rdi), %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX2-NEXT:    andq 200(%rdi), %r12 -; AVX2-NEXT:    orq %rax, %r12 -; AVX2-NEXT:    orq %r8, %r12 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 296(%rdi), %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 40(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT:    andq 424(%rdi), %r11 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 168(%rdi), %rax -; AVX2-NEXT:    orq %r10, %r8 -; AVX2-NEXT:    movq %r8, %r10 -; AVX2-NEXT:    orq %r11, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 360(%rdi), %r8 -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    movq %rax, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 104(%rdi), %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 488(%rdi), %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    andq 232(%rdi), %r14 -; AVX2-NEXT:    orq %rax, %r14 -; AVX2-NEXT:    orq %r8, %r14 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 280(%rdi), %r8 -; AVX2-NEXT:    orq %r10, %r14 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 24(%rdi), %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 408(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 152(%rdi), %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    movq %rax, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT:    andq 344(%rdi), %r11 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 88(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 472(%rdi), %rax -; AVX2-NEXT:    orq %r11, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    andq 216(%rdi), %rbx -; AVX2-NEXT:    orq %rax, %rbx -; AVX2-NEXT:    orq %r8, %rbx -; AVX2-NEXT:    orq %r10, %rbx -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 312(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 56(%rdi), %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 440(%rdi), %r10 -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r11 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 184(%rdi), %r8 -; AVX2-NEXT:    orq %r10, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 376(%rdi), %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 120(%rdi), %rax -; AVX2-NEXT:    orq %r11, %r8 -; AVX2-NEXT:    movq %r8, %r11 -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    movq %rax, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 504(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 248(%rdi), %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r8, %r10 -; AVX2-NEXT:    orq %r11, %rax -; AVX2-NEXT:    movq 1040(%rsp,%rsi), %rsi -; AVX2-NEXT:    orq %rbx, %rax -; AVX2-NEXT:    movq %rax, %r8 -; AVX2-NEXT:    shlxq %rcx, %rsi, %rax -; AVX2-NEXT:    andq 256(%rdi), %r10 -; AVX2-NEXT:    andq (%rdi), %rax -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    orq %r15, %rax -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX2-NEXT:    orq %r13, %rax -; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT:    shldq %cl, %rsi, %rdx -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT:    andq 264(%rdi), %rcx -; AVX2-NEXT:    andq 8(%rdi), %rdx -; AVX2-NEXT:    orq %r9, %rax -; AVX2-NEXT:    orq %rcx, %rdx -; AVX2-NEXT:    orq %rbp, %rdx -; AVX2-NEXT:    orq %r12, %rdx -; AVX2-NEXT:    orq %r14, %rdx -; AVX2-NEXT:    orq %r8, %rdx -; AVX2-NEXT:    orq %rax, %rdx +; AVX2-NEXT:    xorl %esi, %esi +; AVX2-NEXT:    shlxq %rcx, %rax, %rax +; AVX2-NEXT:    testb $64, %cl +; AVX2-NEXT:    cmovneq %rax, %rdx +; AVX2-NEXT:    cmovneq %rsi, %rax +; AVX2-NEXT:    xorq 8(%rdi), %rdx +; AVX2-NEXT:    xorq (%rdi), %rax +; AVX2-NEXT:    movq %rax, (%rdi) +; AVX2-NEXT:    movq %rdx, 8(%rdi) +; AVX2-NEXT:    orq %rdx, %rax  ; AVX2-NEXT:    setne %al -; AVX2-NEXT:    addq $1560, %rsp # imm = 0x618 -; AVX2-NEXT:    popq %rbx -; AVX2-NEXT:    popq %r12 -; AVX2-NEXT:    popq %r13 -; AVX2-NEXT:    popq %r14 -; AVX2-NEXT:    popq %r15 -; AVX2-NEXT:    popq %rbp -; AVX2-NEXT:    vzeroupper  ; AVX2-NEXT:    retq  ; -; AVX512-LABEL: test_ne_i4096: +; AVX512-LABEL: complement_cmpz_i128:  ; AVX512:       # %bb.0: -; AVX512-NEXT:    pushq %rbp -; AVX512-NEXT:    pushq %r15 -; AVX512-NEXT:    pushq %r14 -; AVX512-NEXT:    pushq %r13 -; AVX512-NEXT:    pushq %r12 -; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    subq $1560, %rsp # imm = 0x618  ; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    movl %esi, %eax -; AVX512-NEXT:    andl $4032, %eax # imm = 0xFC0 -; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %eax -; AVX512-NEXT:    negl %eax -; AVX512-NEXT:    movslq %eax, %rsi -; AVX512-NEXT:    movq 1280(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1288(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1536(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1544(%rsp,%rsi), %rax -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1152(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1160(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1408(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1416(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1216(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, (%rsp) # 8-byte Spill -; AVX512-NEXT:    movq 1224(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1472(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1480(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1088(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1096(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1344(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1352(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1248(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1256(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1504(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1512(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1120(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1128(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1376(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1384(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1184(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1192(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1440(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1448(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1056(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1064(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1312(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1320(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1264(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1272(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1520(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1528(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1136(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1144(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1392(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1400(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1200(%rsp,%rsi), %r10 -; AVX512-NEXT:    movq 1208(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %r10, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1456(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1464(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1072(%rsp,%rsi), %r14 -; AVX512-NEXT:    movq 1080(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %r14, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1328(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1336(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1232(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1240(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rax, %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1488(%rsp,%rsi), %r12 -; AVX512-NEXT:    movq 1496(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %r12, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1104(%rsp,%rsi), %rax -; AVX512-NEXT:    movq 1112(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rax, %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1360(%rsp,%rsi), %r11 -; AVX512-NEXT:    movq 1368(%rsp,%rsi), %rbx -; AVX512-NEXT:    movq %rbx, %rdx -; AVX512-NEXT:    shldq %cl, %r11, %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1168(%rsp,%rsi), %r9 -; AVX512-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1176(%rsp,%rsi), %r8 -; AVX512-NEXT:    movq %r8, %rdx -; AVX512-NEXT:    shldq %cl, %r9, %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1424(%rsp,%rsi), %r9 -; AVX512-NEXT:    movq 1432(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, %r15 -; AVX512-NEXT:    shldq %cl, %r9, %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1296(%rsp,%rsi), %rbp -; AVX512-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1304(%rsp,%rsi), %r15 -; AVX512-NEXT:    movq %r15, %r13 -; AVX512-NEXT:    shldq %cl, %rbp, %r13 -; AVX512-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, (%rsp) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq 1048(%rsp,%rsi), %rdx -; AVX512-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %rbx -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r14 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r13 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r12 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r11 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rbp, %r15 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rbp, %r9 -; AVX512-NEXT:    andq 384(%rdi), %r9 -; AVX512-NEXT:    andq 128(%rdi), %r15 -; AVX512-NEXT:    orq %r9, %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq 320(%rdi), %r11 -; AVX512-NEXT:    andq 64(%rdi), %rax -; AVX512-NEXT:    orq %r11, %rax -; AVX512-NEXT:    andq 448(%rdi), %r12 -; AVX512-NEXT:    andq 192(%rdi), %r13 -; AVX512-NEXT:    orq %r12, %r13 -; AVX512-NEXT:    orq %rax, %r13 -; AVX512-NEXT:    andq 288(%rdi), %r8 -; AVX512-NEXT:    andq 32(%rdi), %r14 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 416(%rdi), %rax -; AVX512-NEXT:    orq %r8, %r14 -; AVX512-NEXT:    andq 160(%rdi), %r10 -; AVX512-NEXT:    orq %rax, %r10 -; AVX512-NEXT:    andq 352(%rdi), %rbx -; AVX512-NEXT:    orq %r14, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 96(%rdi), %rax -; AVX512-NEXT:    orq %rbx, %rax -; AVX512-NEXT:    movq %rax, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 480(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    andq 224(%rdi), %r15 -; AVX512-NEXT:    orq %rax, %r15 -; AVX512-NEXT:    orq %r8, %r15 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 272(%rdi), %r8 -; AVX512-NEXT:    orq %r10, %r15 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 16(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT:    andq 400(%rdi), %r9 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 144(%rdi), %rax -; AVX512-NEXT:    orq %r9, %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r9 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    andq 336(%rdi), %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 80(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 464(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT:    andq 208(%rdi), %r11 -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    orq %r8, %r11 -; AVX512-NEXT:    orq %rax, %r11 -; AVX512-NEXT:    orq %r9, %r11 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    andq 304(%rdi), %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 48(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT:    andq 432(%rdi), %r9 -; AVX512-NEXT:    movq (%rsp), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 176(%rdi), %r8 -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    orq %r9, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT:    andq 368(%rdi), %r9 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 112(%rdi), %rax -; AVX512-NEXT:    orq %r10, %r8 -; AVX512-NEXT:    movq %r8, %r10 -; AVX512-NEXT:    orq %r9, %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 496(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT:    andq 240(%rdi), %r9 -; AVX512-NEXT:    orq %r8, %r9 -; AVX512-NEXT:    orq %rax, %r9 -; AVX512-NEXT:    orq %r10, %r9 -; AVX512-NEXT:    orq %r11, %r9 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    andq 392(%rdi), %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT:    andq 136(%rdi), %rbp -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 328(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 72(%rdi), %rax -; AVX512-NEXT:    orq %r10, %rbp -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 456(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX512-NEXT:    andq 200(%rdi), %r12 -; AVX512-NEXT:    orq %rax, %r12 -; AVX512-NEXT:    orq %r8, %r12 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 296(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 40(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 424(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 168(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 360(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 104(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 488(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX512-NEXT:    andq 232(%rdi), %r14 -; AVX512-NEXT:    orq %rax, %r14 -; AVX512-NEXT:    orq %r8, %r14 -; AVX512-NEXT:    orq %r10, %r14 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 280(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 24(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 408(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 152(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT:    andq 344(%rdi), %r11 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 88(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 472(%rdi), %rax -; AVX512-NEXT:    orq %r11, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT:    andq 216(%rdi), %rbx -; AVX512-NEXT:    orq %rax, %rbx -; AVX512-NEXT:    orq %r8, %rbx -; AVX512-NEXT:    orq %r10, %rbx -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    andq 312(%rdi), %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 56(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 440(%rdi), %r8 -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 184(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 376(%rdi), %r8 -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    movq %rax, %r11 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 120(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 504(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 248(%rdi), %r8 -; AVX512-NEXT:    orq %rax, %r8 -; AVX512-NEXT:    orq %r10, %r8 -; AVX512-NEXT:    orq %r11, %r8 -; AVX512-NEXT:    movq 1040(%rsp,%rsi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rsi, %r10 -; AVX512-NEXT:    orq %rbx, %r8 -; AVX512-NEXT:    shlxq %rcx, %rax, %rsi -; AVX512-NEXT:    andq 256(%rdi), %r10 -; AVX512-NEXT:    andq (%rdi), %rsi -; AVX512-NEXT:    orq %r10, %rsi -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT:    orq %r13, %rsi -; AVX512-NEXT:    orq %r15, %rsi -; AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT:    shldq %cl, %rax, %rdx -; AVX512-NEXT:    orq %r9, %rsi -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 264(%rdi), %rax -; AVX512-NEXT:    andq 8(%rdi), %rdx -; AVX512-NEXT:    orq %rax, %rdx -; AVX512-NEXT:    orq %rbp, %rdx -; AVX512-NEXT:    orq %r12, %rdx -; AVX512-NEXT:    orq %r14, %rdx -; AVX512-NEXT:    orq %r8, %rdx +; AVX512-NEXT:    xorl %eax, %eax +; AVX512-NEXT:    movl $1, %edx +; AVX512-NEXT:    xorl %esi, %esi +; AVX512-NEXT:    shldq %cl, %rdx, %rsi +; AVX512-NEXT:    shlxq %rcx, %rdx, %rdx +; AVX512-NEXT:    testb $64, %cl +; AVX512-NEXT:    cmovneq %rdx, %rsi +; AVX512-NEXT:    cmovneq %rax, %rdx +; AVX512-NEXT:    xorq 8(%rdi), %rsi +; AVX512-NEXT:    xorq (%rdi), %rdx +; AVX512-NEXT:    movq %rdx, (%rdi) +; AVX512-NEXT:    movq %rsi, 8(%rdi)  ; AVX512-NEXT:    orq %rsi, %rdx  ; AVX512-NEXT:    setne %al -; AVX512-NEXT:    addq $1560, %rsp # imm = 0x618 -; AVX512-NEXT:    popq %rbx -; AVX512-NEXT:    popq %r12 -; AVX512-NEXT:    popq %r13 -; AVX512-NEXT:    popq %r14 -; AVX512-NEXT:    popq %r15 -; AVX512-NEXT:    popq %rbp -; AVX512-NEXT:    vzeroupper  ; AVX512-NEXT:    retq -  %rem = and i32 %position, 4095 -  %ofs = zext nneg i32 %rem to i4096 -  %bit = shl nuw i4096 1, %ofs -  %ld = load i4096, ptr %word -  %test = and i4096 %ld, %bit -  %cmp = icmp ne i4096 %test, 0 +  %rem = and i32 %position, 127 +  %ofs = zext nneg i32 %rem to i128 +  %bit = shl nuw i128 1, %ofs +  %ld = load i128, ptr %word +  %res = xor i128 %ld, %bit +  store i128 %res, ptr %word +  %cmp = icmp ne i128 %res, 0    ret i1 %cmp  } + +; Multiple loads in store chain +define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { +; X86-LABEL: reset_multiload_i128: +; X86:       # %bb.0: +; X86-NEXT:    pushl %ebp +; X86-NEXT:    movl %esp, %ebp +; X86-NEXT:    pushl %ebx +; X86-NEXT:    pushl %edi +; X86-NEXT:    pushl %esi +; X86-NEXT:    andl $-16, %esp +; X86-NEXT:    subl $64, %esp +; X86-NEXT:    movl 12(%ebp), %ecx +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl %ecx, %eax +; X86-NEXT:    shrb $3, %al +; X86-NEXT:    andb $12, %al +; X86-NEXT:    negb %al +; X86-NEXT:    movsbl %al, %edi +; X86-NEXT:    movl 36(%esp,%edi), %edx +; X86-NEXT:    movl 40(%esp,%edi), %ebx +; X86-NEXT:    movl %ebx, %esi +; X86-NEXT:    shldl %cl, %edx, %esi +; X86-NEXT:    movl 32(%esp,%edi), %eax +; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 44(%esp,%edi), %edi +; X86-NEXT:    shldl %cl, %ebx, %edi +; X86-NEXT:    movl %eax, %ebx +; X86-NEXT:    # kill: def $cl killed $cl killed $ecx +; X86-NEXT:    shll %cl, %ebx +; X86-NEXT:    notl %ebx +; X86-NEXT:    movl 16(%ebp), %eax +; X86-NEXT:    movl (%eax), %eax +; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 12(%ebp), %eax +; X86-NEXT:    andl $96, %eax +; X86-NEXT:    shrl $3, %eax +; X86-NEXT:    movl 8(%ebp), %ecx +; X86-NEXT:    movl (%ecx,%eax), %eax +; X86-NEXT:    andl %ebx, (%ecx) +; X86-NEXT:    movl 12(%ebp), %ecx +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT:    shldl %cl, %ebx, %edx +; X86-NEXT:    notl %edx +; X86-NEXT:    movl 8(%ebp), %ebx +; X86-NEXT:    andl %edx, 4(%ebx) +; X86-NEXT:    notl %esi +; X86-NEXT:    andl %esi, 8(%ebx) +; X86-NEXT:    notl %edi +; X86-NEXT:    andl %edi, 12(%ebx) +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT:    jae .LBB22_2 +; X86-NEXT:  # %bb.1: +; X86-NEXT:    xorl %eax, %eax +; X86-NEXT:  .LBB22_2: +; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    popl %esi +; X86-NEXT:    popl %edi +; X86-NEXT:    popl %ebx +; X86-NEXT:    popl %ebp +; X86-NEXT:    retl +; +; SSE-LABEL: reset_multiload_i128: +; SSE:       # %bb.0: +; SSE-NEXT:    movl %esi, %ecx +; SSE-NEXT:    movl $1, %esi +; SSE-NEXT:    xorl %r8d, %r8d +; SSE-NEXT:    shldq %cl, %rsi, %r8 +; SSE-NEXT:    xorl %eax, %eax +; SSE-NEXT:    shlq %cl, %rsi +; SSE-NEXT:    testb $64, %cl +; SSE-NEXT:    cmovneq %rsi, %r8 +; SSE-NEXT:    cmovneq %rax, %rsi +; SSE-NEXT:    notq %r8 +; SSE-NEXT:    notq %rsi +; SSE-NEXT:    movl %ecx, %r9d +; SSE-NEXT:    andl $96, %r9d +; SSE-NEXT:    shrl $3, %r9d +; SSE-NEXT:    movl (%rdi,%r9), %r9d +; SSE-NEXT:    btl %ecx, %r9d +; SSE-NEXT:    jb .LBB22_2 +; SSE-NEXT:  # %bb.1: +; SSE-NEXT:    movl (%rdx), %eax +; SSE-NEXT:  .LBB22_2: +; SSE-NEXT:    andq %r8, 8(%rdi) +; SSE-NEXT:    andq %rsi, (%rdi) +; SSE-NEXT:    # kill: def $eax killed $eax killed $rax +; SSE-NEXT:    retq +; +; AVX2-LABEL: reset_multiload_i128: +; AVX2:       # %bb.0: +; AVX2-NEXT:    movl %esi, %ecx +; AVX2-NEXT:    xorl %eax, %eax +; AVX2-NEXT:    movl $1, %r8d +; AVX2-NEXT:    xorl %esi, %esi +; AVX2-NEXT:    shldq %cl, %r8, %rsi +; AVX2-NEXT:    shlxq %rcx, %r8, %r8 +; AVX2-NEXT:    testb $64, %cl +; AVX2-NEXT:    cmovneq %r8, %rsi +; AVX2-NEXT:    cmovneq %rax, %r8 +; AVX2-NEXT:    notq %rsi +; AVX2-NEXT:    notq %r8 +; AVX2-NEXT:    movl %ecx, %r9d +; AVX2-NEXT:    andl $96, %r9d +; AVX2-NEXT:    shrl $3, %r9d +; AVX2-NEXT:    movl (%rdi,%r9), %r9d +; AVX2-NEXT:    btl %ecx, %r9d +; AVX2-NEXT:    jb .LBB22_2 +; AVX2-NEXT:  # %bb.1: +; AVX2-NEXT:    movl (%rdx), %eax +; AVX2-NEXT:  .LBB22_2: +; AVX2-NEXT:    andq %rsi, 8(%rdi) +; AVX2-NEXT:    andq %r8, (%rdi) +; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax +; AVX2-NEXT:    retq +; +; AVX512-LABEL: reset_multiload_i128: +; AVX512:       # %bb.0: +; AVX512-NEXT:    movl %esi, %ecx +; AVX512-NEXT:    movl $1, %r8d +; AVX512-NEXT:    xorl %esi, %esi +; AVX512-NEXT:    shldq %cl, %r8, %rsi +; AVX512-NEXT:    xorl %eax, %eax +; AVX512-NEXT:    shlxq %rcx, %r8, %r8 +; AVX512-NEXT:    testb $64, %cl +; AVX512-NEXT:    cmovneq %r8, %rsi +; AVX512-NEXT:    cmovneq %rax, %r8 +; AVX512-NEXT:    notq %rsi +; AVX512-NEXT:    notq %r8 +; AVX512-NEXT:    movl %ecx, %r9d +; AVX512-NEXT:    andl $96, %r9d +; AVX512-NEXT:    shrl $3, %r9d +; AVX512-NEXT:    movl (%rdi,%r9), %r9d +; AVX512-NEXT:    btl %ecx, %r9d +; AVX512-NEXT:    jb .LBB22_2 +; AVX512-NEXT:  # %bb.1: +; AVX512-NEXT:    movl (%rdx), %eax +; AVX512-NEXT:  .LBB22_2: +; AVX512-NEXT:    andq %rsi, 8(%rdi) +; AVX512-NEXT:    andq %r8, (%rdi) +; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax +; AVX512-NEXT:    retq +  %rem = and i32 %position, 127 +  %ofs = zext nneg i32 %rem to i128 +  %bit = shl nuw i128 1, %ofs +  %mask = xor i128 %bit, -1 +  %ld = load i128, ptr %word +  %sel = load i32, ptr %p +  %test = and i128 %ld, %bit +  %res = and i128 %ld, %mask +  %cmp = icmp eq i128 %test, 0 +  store i128 %res, ptr %word +  %ret = select i1 %cmp, i32 %sel, i32 0 +  ret i32 %ret +} | 
