Diffstat (limited to 'llvm/test/CodeGen/X86')
-rw-r--r--  llvm/test/CodeGen/X86/bittest-big-integer.ll | 1005
-rw-r--r--  llvm/test/CodeGen/X86/isel-llvm.sincos.ll    |  133
-rw-r--r--  llvm/test/CodeGen/X86/llvm.sincos.vec.ll     |  404
3 files changed, 1349 insertions(+), 193 deletions(-)
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index dffe900..c311ab8 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -356,20 +356,41 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%edx,%esi), %edi
-; X86-NEXT: btl %ecx, %edi
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB9_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl $0, %edx
+; X86-NEXT: .LBB9_2:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: notl %esi
+; X86-NEXT: notl %edx
+; X86-NEXT: je .LBB9_4
+; X86-NEXT: # %bb.3:
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB9_4:
+; X86-NEXT: andl 4(%ebx), %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: andl (%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: andl $32, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: movl (%ebx,%eax), %eax
+; X86-NEXT: btl %ecx, %eax
; X86-NEXT: setae %al
-; X86-NEXT: btrl %ecx, %edi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, (%edx,%esi)
+; X86-NEXT: movl %edx, (%ebx)
+; X86-NEXT: movl %esi, 4(%ebx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -579,55 +600,208 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%edx,%esi), %edi
-; X86-NEXT: btl %ecx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %ecx, %edi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $96, %esp
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movzbl 16(%ebp), %ebx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 64(%esp,%eax), %edx
+; X86-NEXT: movl 68(%esp,%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movl 72(%esp,%esi), %ebx
+; X86-NEXT: movl 76(%esp,%esi), %esi
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: shldl %cl, %ebx, %esi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, (%edx,%esi)
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: notl %edi
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: movl 36(%esp,%ecx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%esp,%ecx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: andl 8(%eax), %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: notl %esi
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl 44(%esp,%eax), %eax
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: andl 12(%ecx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl 32(%esp,%eax), %edx
+; X86-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: andl (%eax), %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%esp), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: andl 4(%ecx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl 12(%ebp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: andl $96, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: movl (%ecx,%eax), %eax
+; X86-NEXT: btl %esi, %eax
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %edi, 8(%ecx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %ebx, (%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: setae %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $96, %esi
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: movl (%rdi,%rsi), %r8d
-; SSE-NEXT: btl %ecx, %r8d
+; SSE-NEXT: movl $1, %esi
+; SSE-NEXT: xorl %r8d, %r8d
+; SSE-NEXT: shldq %cl, %rsi, %r8
+; SSE-NEXT: shlq %cl, %rsi
+; SSE-NEXT: movl %edx, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: shlq %cl, %rax
+; SSE-NEXT: xorl %r9d, %r9d
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rsi, %r8
+; SSE-NEXT: cmovneq %r9, %rsi
+; SSE-NEXT: notq %r8
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: cmovneq %r9, %rax
+; SSE-NEXT: notq %rsi
+; SSE-NEXT: andq 8(%rdi), %r8
+; SSE-NEXT: orq %rdx, %r8
+; SSE-NEXT: andq (%rdi), %rsi
+; SSE-NEXT: orq %rax, %rsi
+; SSE-NEXT: movl %ecx, %eax
+; SSE-NEXT: andl $96, %eax
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: movl (%rdi,%rax), %eax
+; SSE-NEXT: btl %ecx, %eax
; SSE-NEXT: setae %al
-; SSE-NEXT: shll %cl, %edx
-; SSE-NEXT: btrl %ecx, %r8d
-; SSE-NEXT: orl %r8d, %edx
-; SSE-NEXT: movl %edx, (%rdi,%rsi)
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: retq
;
-; AVX-LABEL: init_eq_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: andl $96, %ecx
-; AVX-NEXT: shrl $3, %ecx
-; AVX-NEXT: movl (%rdi,%rcx), %r8d
-; AVX-NEXT: btl %esi, %r8d
-; AVX-NEXT: setae %al
-; AVX-NEXT: btrl %esi, %r8d
-; AVX-NEXT: shlxl %esi, %edx, %edx
-; AVX-NEXT: orl %r8d, %edx
-; AVX-NEXT: movl %edx, (%rdi,%rcx)
-; AVX-NEXT: retq
+; AVX2-LABEL: init_eq_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: movl $1, %eax
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: shldq %cl, %rax, %rsi
+; AVX2-NEXT: movl %edx, %edx
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: shldq %cl, %rdx, %r8
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: shlxq %rcx, %rax, %rax
+; AVX2-NEXT: testb $64, %cl
+; AVX2-NEXT: cmovneq %rax, %rsi
+; AVX2-NEXT: cmovneq %r9, %rax
+; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX2-NEXT: cmovneq %rdx, %r8
+; AVX2-NEXT: cmovneq %r9, %rdx
+; AVX2-NEXT: andnq 8(%rdi), %rsi, %rsi
+; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: andnq (%rdi), %rax, %r8
+; AVX2-NEXT: orq %rdx, %r8
+; AVX2-NEXT: movl %ecx, %eax
+; AVX2-NEXT: andl $96, %eax
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: movl (%rdi,%rax), %eax
+; AVX2-NEXT: btl %ecx, %eax
+; AVX2-NEXT: setae %al
+; AVX2-NEXT: movq %r8, (%rdi)
+; AVX2-NEXT: movq %rsi, 8(%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: init_eq_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: movl $1, %eax
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: shldq %cl, %rax, %rsi
+; AVX512-NEXT: xorl %r8d, %r8d
+; AVX512-NEXT: shlxq %rcx, %rax, %rax
+; AVX512-NEXT: movl %edx, %edx
+; AVX512-NEXT: xorl %r9d, %r9d
+; AVX512-NEXT: shldq %cl, %rdx, %r9
+; AVX512-NEXT: testb $64, %cl
+; AVX512-NEXT: cmovneq %rax, %rsi
+; AVX512-NEXT: cmovneq %r8, %rax
+; AVX512-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX512-NEXT: cmovneq %rdx, %r9
+; AVX512-NEXT: cmovneq %r8, %rdx
+; AVX512-NEXT: andnq 8(%rdi), %rsi, %rsi
+; AVX512-NEXT: orq %r9, %rsi
+; AVX512-NEXT: andnq (%rdi), %rax, %r8
+; AVX512-NEXT: orq %rdx, %r8
+; AVX512-NEXT: movl %ecx, %eax
+; AVX512-NEXT: andl $96, %eax
+; AVX512-NEXT: shrl $3, %eax
+; AVX512-NEXT: movl (%rdi,%rax), %eax
+; AVX512-NEXT: btl %ecx, %eax
+; AVX512-NEXT: setae %al
+; AVX512-NEXT: movq %r8, (%rdi)
+; AVX512-NEXT: movq %rsi, 8(%rdi)
+; AVX512-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
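; The IR above builds the single-bit mask that each lowering materializes: %rem clamps
; the position into [0,127], and %bit is (i128 1) << %rem, produced above with shld/shl
; funnel-shift sequences (plus cmovne on x86-64 to select between the two 64-bit halves).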
@@ -803,55 +977,673 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%edx,%esi), %edi
-; X86-NEXT: btl %ecx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %ecx, %edi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $352, %esp # imm = 0x160
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $60, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: subl %edx, %eax
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 56(%eax), %esi
+; X86-NEXT: movl 60(%eax), %ebx
+; X86-NEXT: movl 52(%eax), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%eax), %edi
+; X86-NEXT: movl 44(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl 16(%ebp), %eax
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 56(%eax), %esi
+; X86-NEXT: movl 60(%eax), %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: andl 60(%edx), %ebx
; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, (%edx,%esi)
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 52(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 56(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 48(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 52(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 44(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 48(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 40(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 44(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 36(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 40(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 32(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 36(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 28(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 32(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 24(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 28(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 20(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 24(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 16(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 20(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 12(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 16(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 8(%eax), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: andl 12(%edx), %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl 4(%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: andl 8(%edx), %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: notl %esi
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: andl 4(%edx), %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: notl %esi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: andl (%edx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%edx,%eax), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 60(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 56(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 52(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 48(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 44(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 40(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 36(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 32(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%edx)
+; X86-NEXT: movl %ebx, 8(%edx)
+; X86-NEXT: movl %edi, 4(%edx)
+; X86-NEXT: movl %esi, (%edx)
+; X86-NEXT: setae %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i512:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $168, %rsp
+; SSE-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $60, %esi
-; SSE-NEXT: movl (%rdi,%rsi), %r8d
-; SSE-NEXT: btl %ecx, %r8d
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: movl %esi, %eax
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: andl $56, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: movslq %eax, %r12
+; SSE-NEXT: movq 136(%rsp,%r12), %r9
+; SSE-NEXT: movq 144(%rsp,%r12), %rax
+; SSE-NEXT: movq %rax, %rsi
+; SSE-NEXT: shldq %cl, %r9, %rsi
+; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 152(%rsp,%r12), %r11
+; SSE-NEXT: shldq %cl, %rax, %r11
+; SSE-NEXT: movq 120(%rsp,%r12), %r10
+; SSE-NEXT: movq 128(%rsp,%r12), %rax
+; SSE-NEXT: movq %rax, %rbx
+; SSE-NEXT: shldq %cl, %r10, %rbx
+; SSE-NEXT: shldq %cl, %rax, %r9
+; SSE-NEXT: movq 104(%rsp,%r12), %r14
+; SSE-NEXT: movq 112(%rsp,%r12), %rax
+; SSE-NEXT: movq %rax, %r15
+; SSE-NEXT: shldq %cl, %r14, %r15
+; SSE-NEXT: shldq %cl, %rax, %r10
+; SSE-NEXT: movq 96(%rsp,%r12), %rax
+; SSE-NEXT: movq %rax, %r13
+; SSE-NEXT: shlq %cl, %r13
+; SSE-NEXT: shldq %cl, %rax, %r14
+; SSE-NEXT: movl %edx, %eax
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq 8(%rsp,%r12), %r8
+; SSE-NEXT: movq 16(%rsp,%r12), %rsi
+; SSE-NEXT: movq %rsi, %rbp
+; SSE-NEXT: shldq %cl, %r8, %rbp
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: notq %rax
+; SSE-NEXT: andq 48(%rdi), %rax
+; SSE-NEXT: orq %rbp, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: notq %rbx
+; SSE-NEXT: notq %r11
+; SSE-NEXT: movq 24(%rsp,%r12), %rax
+; SSE-NEXT: shldq %cl, %rsi, %rax
+; SSE-NEXT: movq -8(%rsp,%r12), %rbp
+; SSE-NEXT: movq (%rsp,%r12), %rdx
+; SSE-NEXT: movq %rdx, %rsi
+; SSE-NEXT: shldq %cl, %rbp, %rsi
+; SSE-NEXT: andq 56(%rdi), %r11
+; SSE-NEXT: andq 32(%rdi), %rbx
+; SSE-NEXT: orq %rax, %r11
+; SSE-NEXT: orq %rsi, %rbx
+; SSE-NEXT: notq %r15
+; SSE-NEXT: shldq %cl, %rdx, %r8
+; SSE-NEXT: notq %r9
+; SSE-NEXT: andq 40(%rdi), %r9
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: movq -24(%rsp,%r12), %rax
+; SSE-NEXT: movq -16(%rsp,%r12), %rdx
+; SSE-NEXT: movq %rdx, %rsi
+; SSE-NEXT: shldq %cl, %rax, %rsi
+; SSE-NEXT: andq 16(%rdi), %r15
+; SSE-NEXT: orq %rsi, %r15
+; SSE-NEXT: shldq %cl, %rdx, %rbp
+; SSE-NEXT: notq %r10
+; SSE-NEXT: notq %r13
+; SSE-NEXT: movq -32(%rsp,%r12), %rdx
+; SSE-NEXT: movq %rdx, %rsi
+; SSE-NEXT: shlq %cl, %rsi
+; SSE-NEXT: andq 24(%rdi), %r10
+; SSE-NEXT: andq (%rdi), %r13
+; SSE-NEXT: orq %rbp, %r10
+; SSE-NEXT: orq %rsi, %r13
+; SSE-NEXT: notq %r14
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: andq 8(%rdi), %r14
+; SSE-NEXT: orq %rax, %r14
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andl $60, %eax
+; SSE-NEXT: movl (%rdi,%rax), %eax
+; SSE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload
+; SSE-NEXT: btl %ecx, %eax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: movq %rax, 48(%rdi)
+; SSE-NEXT: movq %r11, 56(%rdi)
+; SSE-NEXT: movq %rbx, 32(%rdi)
+; SSE-NEXT: movq %r9, 40(%rdi)
+; SSE-NEXT: movq %r15, 16(%rdi)
+; SSE-NEXT: movq %r10, 24(%rdi)
+; SSE-NEXT: movq %r13, (%rdi)
+; SSE-NEXT: movq %r14, 8(%rdi)
; SSE-NEXT: setae %al
-; SSE-NEXT: shll %cl, %edx
-; SSE-NEXT: btrl %ecx, %r8d
-; SSE-NEXT: orl %r8d, %edx
-; SSE-NEXT: movl %edx, (%rdi,%rsi)
+; SSE-NEXT: addq $168, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
-; AVX-LABEL: init_eq_i512:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: shrl $3, %ecx
-; AVX-NEXT: andl $60, %ecx
-; AVX-NEXT: movl (%rdi,%rcx), %r8d
-; AVX-NEXT: btl %esi, %r8d
-; AVX-NEXT: setae %al
-; AVX-NEXT: btrl %esi, %r8d
-; AVX-NEXT: shlxl %esi, %edx, %edx
-; AVX-NEXT: orl %r8d, %edx
-; AVX-NEXT: movl %edx, (%rdi,%rcx)
-; AVX-NEXT: retq
+; AVX2-LABEL: init_eq_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $184, %rsp
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: movl %esi, %ebx
+; AVX2-NEXT: shrl $3, %ebx
+; AVX2-NEXT: movl %ebx, %eax
+; AVX2-NEXT: andl $56, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: movslq %eax, %r11
+; AVX2-NEXT: movq 128(%rsp,%r11), %r15
+; AVX2-NEXT: movq 136(%rsp,%r11), %rax
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: shldq %cl, %r15, %rsi
+; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 120(%rsp,%r11), %r8
+; AVX2-NEXT: shldq %cl, %r8, %r15
+; AVX2-NEXT: movq 144(%rsp,%r11), %r14
+; AVX2-NEXT: movq 152(%rsp,%r11), %rsi
+; AVX2-NEXT: movq %rsi, %r9
+; AVX2-NEXT: shldq %cl, %r14, %r9
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rax, %r14
+; AVX2-NEXT: movq 112(%rsp,%r11), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 160(%rsp,%r11), %r13
+; AVX2-NEXT: movq 168(%rsp,%r11), %r12
+; AVX2-NEXT: shldq %cl, %r13, %r12
+; AVX2-NEXT: shldq %cl, %rsi, %r13
+; AVX2-NEXT: shldq %cl, %rax, %r8
+; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl %edx, %eax
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq 24(%rsp,%r11), %rbp
+; AVX2-NEXT: movq 32(%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %rax
+; AVX2-NEXT: shldq %cl, %rbp, %rax
+; AVX2-NEXT: movq 40(%rsp,%r11), %r10
+; AVX2-NEXT: shldq %cl, %rdx, %r10
+; AVX2-NEXT: movq 8(%rsp,%r11), %r9
+; AVX2-NEXT: movq 16(%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %r8
+; AVX2-NEXT: shldq %cl, %r9, %r8
+; AVX2-NEXT: shldq %cl, %rdx, %rbp
+; AVX2-NEXT: andnq 48(%rdi), %r13, %r13
+; AVX2-NEXT: orq %rax, %r13
+; AVX2-NEXT: movq -8(%rsp,%r11), %rax
+; AVX2-NEXT: movq (%rsp,%r11), %rdx
+; AVX2-NEXT: movq %rdx, %rsi
+; AVX2-NEXT: shldq %cl, %rax, %rsi
+; AVX2-NEXT: shldq %cl, %rdx, %r9
+; AVX2-NEXT: andnq 56(%rdi), %r12, %r12
+; AVX2-NEXT: andnq 32(%rdi), %r14, %r14
+; AVX2-NEXT: orq %r10, %r12
+; AVX2-NEXT: orq %r8, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: andnq 40(%rdi), %rdx, %rdx
+; AVX2-NEXT: orq %rbp, %rdx
+; AVX2-NEXT: shlxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT: movq -16(%rsp,%r11), %r10
+; AVX2-NEXT: shlxq %rcx, %r10, %r11
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %r10, %rax
+; AVX2-NEXT: andnq 16(%rdi), %r15, %rcx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andnq 24(%rdi), %r10, %r10
+; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: orq %r9, %r10
+; AVX2-NEXT: andnq (%rdi), %r8, %rsi
+; AVX2-NEXT: orq %r11, %rsi
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andnq 8(%rdi), %r8, %r8
+; AVX2-NEXT: orq %rax, %r8
+; AVX2-NEXT: andl $60, %ebx
+; AVX2-NEXT: movl (%rdi,%rbx), %eax
+; AVX2-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; AVX2-NEXT: btl %r9d, %eax
+; AVX2-NEXT: movq %r13, 48(%rdi)
+; AVX2-NEXT: movq %r12, 56(%rdi)
+; AVX2-NEXT: movq %r14, 32(%rdi)
+; AVX2-NEXT: movq %rdx, 40(%rdi)
+; AVX2-NEXT: movq %rcx, 16(%rdi)
+; AVX2-NEXT: movq %r10, 24(%rdi)
+; AVX2-NEXT: movq %rsi, (%rdi)
+; AVX2-NEXT: movq %r8, 8(%rdi)
+; AVX2-NEXT: setae %al
+; AVX2-NEXT: addq $184, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: init_eq_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $168, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: movl %esi, %r10d
+; AVX512-NEXT: shrl $3, %r10d
+; AVX512-NEXT: movl %r10d, %r8d
+; AVX512-NEXT: andl $56, %r8d
+; AVX512-NEXT: negl %r8d
+; AVX512-NEXT: movslq %r8d, %r9
+; AVX512-NEXT: movq 112(%rsp,%r9), %r11
+; AVX512-NEXT: movq 120(%rsp,%r9), %r14
+; AVX512-NEXT: movq %r14, %rax
+; AVX512-NEXT: shldq %cl, %r11, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 104(%rsp,%r9), %rax
+; AVX512-NEXT: shldq %cl, %rax, %r11
+; AVX512-NEXT: movq 128(%rsp,%r9), %r15
+; AVX512-NEXT: movq 136(%rsp,%r9), %rbp
+; AVX512-NEXT: movq %rbp, %rbx
+; AVX512-NEXT: shldq %cl, %r15, %rbx
+; AVX512-NEXT: shldq %cl, %r14, %r15
+; AVX512-NEXT: movq 144(%rsp,%r9), %r13
+; AVX512-NEXT: movq 152(%rsp,%r9), %r12
+; AVX512-NEXT: shldq %cl, %r13, %r12
+; AVX512-NEXT: movq 96(%rsp,%r9), %r14
+; AVX512-NEXT: shldq %cl, %rbp, %r13
+; AVX512-NEXT: shldq %cl, %r14, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movl %edx, %edx
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq 8(%rsp,%r9), %r8
+; AVX512-NEXT: movq 16(%rsp,%r9), %rax
+; AVX512-NEXT: movq %rax, %rbp
+; AVX512-NEXT: shldq %cl, %r8, %rbp
+; AVX512-NEXT: andnq 48(%rdi), %r13, %r13
+; AVX512-NEXT: orq %rbp, %r13
+; AVX512-NEXT: movq 24(%rsp,%r9), %rbp
+; AVX512-NEXT: shldq %cl, %rax, %rbp
+; AVX512-NEXT: movq -8(%rsp,%r9), %rax
+; AVX512-NEXT: movq (%rsp,%r9), %rsi
+; AVX512-NEXT: movq %rsi, %rdx
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: andnq 56(%rdi), %r12, %r12
+; AVX512-NEXT: orq %rbp, %r12
+; AVX512-NEXT: andnq 32(%rdi), %r15, %r15
+; AVX512-NEXT: orq %rdx, %r15
+; AVX512-NEXT: shldq %cl, %rsi, %r8
+; AVX512-NEXT: movq -24(%rsp,%r9), %rdx
+; AVX512-NEXT: movq -16(%rsp,%r9), %rsi
+; AVX512-NEXT: movq %rsi, %rbp
+; AVX512-NEXT: shldq %cl, %rdx, %rbp
+; AVX512-NEXT: andnq 40(%rdi), %rbx, %rbx
+; AVX512-NEXT: orq %r8, %rbx
+; AVX512-NEXT: andnq 16(%rdi), %r11, %r8
+; AVX512-NEXT: orq %rbp, %r8
+; AVX512-NEXT: shlxq %rcx, %r14, %r11
+; AVX512-NEXT: movq -32(%rsp,%r9), %r9
+; AVX512-NEXT: shldq %cl, %rsi, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT: andnq 24(%rdi), %rsi, %rsi
+; AVX512-NEXT: orq %rax, %rsi
+; AVX512-NEXT: shlxq %rcx, %r9, %rax
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %r9, %rdx
+; AVX512-NEXT: andnq (%rdi), %r11, %rcx
+; AVX512-NEXT: orq %rax, %rcx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andnq 8(%rdi), %rax, %rax
+; AVX512-NEXT: orq %rdx, %rax
+; AVX512-NEXT: andl $60, %r10d
+; AVX512-NEXT: movl (%rdi,%r10), %edx
+; AVX512-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r9d # 4-byte Reload
+; AVX512-NEXT: btl %r9d, %edx
+; AVX512-NEXT: movq %r13, 48(%rdi)
+; AVX512-NEXT: movq %r12, 56(%rdi)
+; AVX512-NEXT: movq %r15, 32(%rdi)
+; AVX512-NEXT: movq %rbx, 40(%rdi)
+; AVX512-NEXT: movq %r8, 16(%rdi)
+; AVX512-NEXT: movq %rsi, 24(%rdi)
+; AVX512-NEXT: movq %rcx, (%rdi)
+; AVX512-NEXT: movq %rax, 8(%rdi)
+; AVX512-NEXT: setae %al
+; AVX512-NEXT: addq $168, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
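; Same mask pattern at i512: %rem = %position & 511 and %bit = (i512 1) << %rem. The
; expansions above split %rem into a within-lane shift count (andl $63 on x86-64,
; andl $31 on 32-bit) and a byte offset of the containing lane (shrl $3 plus andl $56
; or andl $60), shifting a one-hot value through a zero-initialized stack temporary.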
@@ -1049,40 +1841,33 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 36(%esp,%edi), %edx
-; X86-NEXT: movl 40(%esp,%edi), %ebx
-; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 40(%esp,%eax), %edx
+; X86-NEXT: movl 44(%esp,%eax), %esi
; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 32(%esp,%edi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esp,%edi), %edi
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: movl 32(%esp,%eax), %edi
+; X86-NEXT: movl 36(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: shldl %cl, %edi, %ebx
; X86-NEXT: notl %ebx
; X86-NEXT: movl 16(%ebp), %eax
; X86-NEXT: movl (%eax), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: andl $96, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx,%eax), %eax
-; X86-NEXT: andl %ebx, (%ecx)
-; X86-NEXT: movl 12(%ebp), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
-; X86-NEXT: notl %edx
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl %edx, 4(%ebx)
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %esi, 8(%ebx)
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: andl %ebx, 4(%eax)
+; X86-NEXT: shll %cl, %edi
; X86-NEXT: notl %edi
-; X86-NEXT: andl %edi, 12(%ebx)
-; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl $96, %ebx
+; X86-NEXT: shrl $3, %ebx
+; X86-NEXT: movl (%eax,%ebx), %ebx
+; X86-NEXT: andl %edi, (%eax)
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %esi, 12(%eax)
+; X86-NEXT: notl %edx
+; X86-NEXT: andl %edx, 8(%eax)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: btl %ecx, %ebx
; X86-NEXT: jae .LBB22_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
@@ -1116,8 +1901,8 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; SSE-NEXT: # %bb.1:
; SSE-NEXT: movl (%rdx), %eax
; SSE-NEXT: .LBB22_2:
-; SSE-NEXT: andq %r8, 8(%rdi)
; SSE-NEXT: andq %rsi, (%rdi)
+; SSE-NEXT: andq %r8, 8(%rdi)
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: retq
;
@@ -1143,8 +1928,8 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; AVX2-NEXT: # %bb.1:
; AVX2-NEXT: movl (%rdx), %eax
; AVX2-NEXT: .LBB22_2:
-; AVX2-NEXT: andq %rsi, 8(%rdi)
; AVX2-NEXT: andq %r8, (%rdi)
+; AVX2-NEXT: andq %rsi, 8(%rdi)
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: retq
;
@@ -1170,8 +1955,8 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; AVX512-NEXT: # %bb.1:
; AVX512-NEXT: movl (%rdx), %eax
; AVX512-NEXT: .LBB22_2:
-; AVX512-NEXT: andq %rsi, 8(%rdi)
; AVX512-NEXT: andq %r8, (%rdi)
+; AVX512-NEXT: andq %rsi, 8(%rdi)
; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
; AVX512-NEXT: retq
%rem = and i32 %position, 127
diff --git a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
index 065710f..8576f8f 100644
--- a/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
+++ b/llvm/test/CodeGen/X86/isel-llvm.sincos.ll
@@ -3,6 +3,9 @@
; RUN: llc < %s -mtriple=x86_64-linux-gnu -fast-isel | FileCheck %s --check-prefixes=X64,FASTISEL-X64
; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X86,SDAG-X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=0 -fast-isel=0 | FileCheck %s --check-prefixes=X64,SDAG-X64
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-SINCOS-STRET
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 -mcpu=core2 | FileCheck %s --check-prefix=MACOS-NOSINCOS-STRET
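+; On macOS, the 10.9 triple exercises the combined __sincosf_stret/__sincos_stret
+; calls (sin and cos computed in one call; for f32 both results come back packed in
+; %xmm0, with the movshdup in the checks below extracting the cos lane). The 10.8
+; triple predates these routines and falls back to separate sinf/cosf calls.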
+
; TODO: The below RUN line fails GlobalISel selection and falls back to DAG selection due to missing load/store support in i686 mode. Support is expected soon; until then, the llvm/test/CodeGen/X86/GlobalISel/llvm.sincos.mir test covers i686 for GlobalISel.
; RUN: llc < %s -mtriple=i686-linux-gnu -global-isel=1 -global-isel-abort=2 | FileCheck %s --check-prefixes=GISEL-X86
; RUN: llc < %s -mtriple=x86_64-linux-gnu -global-isel=1 -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64
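; Note that -global-isel-abort=2 reports a diagnostic and falls back to SelectionDAG
; instead of aborting, so the i686 GlobalISel RUN line above still produces output;
; with -global-isel-abort=1 any selection failure would be a hard error.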
@@ -34,6 +37,29 @@ define { float, float } @test_sincos_f32(float %Val) nounwind {
; X64-NEXT: popq %rax
; X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f32:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %rax
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; MACOS-SINCOS-STRET-NEXT: popq %rax
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f32:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rax
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1
+; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: popq %rax
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: test_sincos_f32:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: subl $28, %esp
@@ -93,6 +119,28 @@ define { double, double } @test_sincos_f64(double %Val) nounwind {
; X64-NEXT: addq $24, %rsp
; X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f64:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %rax
+; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT: popq %rax
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f64:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: subq $24, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _sin
+; MACOS-NOSINCOS-STRET-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero
+; MACOS-NOSINCOS-STRET-NEXT: callq _cos
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, %xmm1
+; MACOS-NOSINCOS-STRET-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 8-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero
+; MACOS-NOSINCOS-STRET-NEXT: addq $24, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: test_sincos_f64:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: subl $44, %esp
@@ -153,6 +201,40 @@ define { x86_fp80, x86_fp80 } @test_sincos_f80(x86_fp80 %Val) nounwind {
; X64-NEXT: addq $56, %rsp
; X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_f80:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: subq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp)
+; MACOS-SINCOS-STRET-NEXT: fld %st(0)
+; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-SINCOS-STRET-NEXT: callq _cosl
+; MACOS-SINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-SINCOS-STRET-NEXT: callq _sinl
+; MACOS-SINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: fxch %st(1)
+; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_f80:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: subq $40, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: fldt {{[0-9]+}}(%rsp)
+; MACOS-NOSINCOS-STRET-NEXT: fld %st(0)
+; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosl
+; MACOS-NOSINCOS-STRET-NEXT: fstpt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Spill
+; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: fstpt (%rsp)
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinl
+; MACOS-NOSINCOS-STRET-NEXT: fldt {{[-0-9]+}}(%r{{[sb]}}p) ## 10-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: fxch %st(1)
+; MACOS-NOSINCOS-STRET-NEXT: addq $40, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: test_sincos_f80:
; GISEL-X86: # %bb.0:
; GISEL-X86-NEXT: subl $60, %esp
@@ -288,6 +370,57 @@ define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %
; SDAG-X64-NEXT: popq %r14
; SDAG-X64-NEXT: retq
;
+; MACOS-SINCOS-STRET-LABEL: can_fold_with_call_in_chain:
+; MACOS-SINCOS-STRET: ## %bb.0: ## %entry
+; MACOS-SINCOS-STRET-NEXT: pushq %r14
+; MACOS-SINCOS-STRET-NEXT: pushq %rbx
+; MACOS-SINCOS-STRET-NEXT: subq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movq %r14, %rdi
+; MACOS-SINCOS-STRET-NEXT: movq %rbx, %rsi
+; MACOS-SINCOS-STRET-NEXT: callq _foo
+; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%r14)
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movss %xmm0, (%rbx)
+; MACOS-SINCOS-STRET-NEXT: addq $40, %rsp
+; MACOS-SINCOS-STRET-NEXT: popq %rbx
+; MACOS-SINCOS-STRET-NEXT: popq %r14
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: can_fold_with_call_in_chain:
+; MACOS-NOSINCOS-STRET: ## %bb.0: ## %entry
+; MACOS-NOSINCOS-STRET-NEXT: pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rax
+; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rsp) ## 4-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movq %r14, %rdi
+; MACOS-NOSINCOS-STRET-NEXT: movq %rbx, %rsi
+; MACOS-NOSINCOS-STRET-NEXT: callq _foo
+; MACOS-NOSINCOS-STRET-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT: movss (%rsp), %xmm0 ## 4-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; MACOS-NOSINCOS-STRET-NEXT: movss %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT: addq $8, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: popq %r14
+; MACOS-NOSINCOS-STRET-NEXT: retq
+;
; GISEL-X86-LABEL: can_fold_with_call_in_chain:
; GISEL-X86: # %bb.0: # %entry
; GISEL-X86-NEXT: pushl %ebx
diff --git a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
index 834dd78..9b02438 100644
--- a/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
+++ b/llvm/test/CodeGen/X86/llvm.sincos.vec.ll
@@ -1,59 +1,213 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 5
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck -check-prefix=X86 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck -check-prefix=X64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9.0 | FileCheck --check-prefix=MACOS-SINCOS-STRET %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.8.0 | FileCheck --check-prefix=MACOS-NOSINCOS-STRET %s
define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind {
-; CHECK-LABEL: test_sincos_v4f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $52, %esp
-; CHECK-NEXT: movl 84(%esp), %esi
-; CHECK-NEXT: flds 76(%esp)
-; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: flds 64(%esp)
-; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: flds 72(%esp)
-; CHECK-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; CHECK-NEXT: flds 68(%esp)
-; CHECK-NEXT: movl 80(%esp), %edi
-; CHECK-NEXT: leal 40(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: leal 4(%edi), %eax
-; CHECK-NEXT: movl %eax, 4(%esp)
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: leal 44(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: leal 8(%edi), %eax
-; CHECK-NEXT: movl %eax, 4(%esp)
-; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: leal 36(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: movl %edi, 4(%esp)
-; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: leal 48(%esp), %eax
-; CHECK-NEXT: movl %eax, 8(%esp)
-; CHECK-NEXT: addl $12, %edi
-; CHECK-NEXT: movl %edi, 4(%esp)
-; CHECK-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; CHECK-NEXT: fstps (%esp)
-; CHECK-NEXT: calll sincosf
-; CHECK-NEXT: flds 36(%esp)
-; CHECK-NEXT: flds 40(%esp)
-; CHECK-NEXT: flds 44(%esp)
-; CHECK-NEXT: flds 48(%esp)
-; CHECK-NEXT: fstps 12(%esi)
-; CHECK-NEXT: fstps 8(%esi)
-; CHECK-NEXT: fstps 4(%esi)
-; CHECK-NEXT: fstps (%esi)
-; CHECK-NEXT: addl $52, %esp
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: popl %edi
-; CHECK-NEXT: retl
+; X86-LABEL: test_sincos_v4f32:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $52, %esp
+; X86-NEXT: movl 84(%esp), %esi
+; X86-NEXT: flds 76(%esp)
+; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: flds 64(%esp)
+; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: flds 72(%esp)
+; X86-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: flds 68(%esp)
+; X86-NEXT: movl 80(%esp), %edi
+; X86-NEXT: leal 40(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: leal 4(%edi), %eax
+; X86-NEXT: movl %eax, 4(%esp)
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: leal 44(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: leal 8(%edi), %eax
+; X86-NEXT: movl %eax, 4(%esp)
+; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: leal 36(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: movl %edi, 4(%esp)
+; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: leal 48(%esp), %eax
+; X86-NEXT: movl %eax, 8(%esp)
+; X86-NEXT: addl $12, %edi
+; X86-NEXT: movl %edi, 4(%esp)
+; X86-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: fstps (%esp)
+; X86-NEXT: calll sincosf
+; X86-NEXT: flds 36(%esp)
+; X86-NEXT: flds 40(%esp)
+; X86-NEXT: flds 44(%esp)
+; X86-NEXT: flds 48(%esp)
+; X86-NEXT: fstps 12(%esi)
+; X86-NEXT: fstps 8(%esi)
+; X86-NEXT: fstps 4(%esi)
+; X86-NEXT: fstps (%esi)
+; X86-NEXT: addl $52, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: test_sincos_v4f32:
+; X64: # %bb.0:
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $56, %rsp
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; X64-NEXT: leaq 4(%rsp), %rdi
+; X64-NEXT: movq %rsp, %rsi
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: leaq 12(%rsp), %rdi
+; X64-NEXT: leaq 8(%rsp), %rsi
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: leaq 28(%rsp), %rdi
+; X64-NEXT: leaq 24(%rsp), %rsi
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-NEXT: leaq 20(%rsp), %rdi
+; X64-NEXT: leaq 16(%rsp), %rsi
+; X64-NEXT: callq sincosf@PLT
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; X64-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; X64-NEXT: movups %xmm1, (%r14)
+; X64-NEXT: movups %xmm0, (%rbx)
+; X64-NEXT: addq $56, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r14
+; X64-NEXT: retq
+;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_v4f32:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %r14
+; MACOS-SINCOS-STRET-NEXT: pushq %rbx
+; MACOS-SINCOS-STRET-NEXT: subq $104, %rsp
+; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: unpcklps (%rsp), %xmm0 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, %xmm1
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincosf_stret
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-SINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; MACOS-SINCOS-STRET-NEXT: unpcklpd (%rsp), %xmm2 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: ## xmm2 = xmm2[0],mem[0]
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-SINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx)
+; MACOS-SINCOS-STRET-NEXT: addq $104, %rsp
+; MACOS-SINCOS-STRET-NEXT: popq %rbx
+; MACOS-SINCOS-STRET-NEXT: popq %r14
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v4f32:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: subq $104, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cosf
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sinf
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; MACOS-NOSINCOS-STRET-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Folded Reload
+; MACOS-NOSINCOS-STRET-NEXT: ## xmm1 = xmm1[0],mem[0]
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT: addq $104, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: popq %r14
+; MACOS-NOSINCOS-STRET-NEXT: retq
%result = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> %x)
%result.0 = extractvalue { <4 x float>, <4 x float> } %result, 0
%result.1 = extractvalue { <4 x float>, <4 x float> } %result, 1
@@ -63,36 +217,120 @@ define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias
}
define void @test_sincos_v2f64(<2 x double> %x, ptr noalias %out_sin, ptr noalias %out_cos) nounwind {
-; CHECK-LABEL: test_sincos_v2f64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: pushl %edi
-; CHECK-NEXT: pushl %esi
-; CHECK-NEXT: subl $52, %esp
-; CHECK-NEXT: movl 84(%esp), %esi
-; CHECK-NEXT: fldl 72(%esp)
-; CHECK-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
-; CHECK-NEXT: fldl 64(%esp)
-; CHECK-NEXT: movl 80(%esp), %edi
-; CHECK-NEXT: leal 24(%esp), %eax
-; CHECK-NEXT: movl %eax, 12(%esp)
-; CHECK-NEXT: movl %edi, 8(%esp)
-; CHECK-NEXT: fstpl (%esp)
-; CHECK-NEXT: calll sincos
-; CHECK-NEXT: leal 32(%esp), %eax
-; CHECK-NEXT: movl %eax, 12(%esp)
-; CHECK-NEXT: addl $8, %edi
-; CHECK-NEXT: movl %edi, 8(%esp)
-; CHECK-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
-; CHECK-NEXT: fstpl (%esp)
-; CHECK-NEXT: calll sincos
-; CHECK-NEXT: fldl 24(%esp)
-; CHECK-NEXT: fldl 32(%esp)
-; CHECK-NEXT: fstpl 8(%esi)
-; CHECK-NEXT: fstpl (%esi)
-; CHECK-NEXT: addl $52, %esp
-; CHECK-NEXT: popl %esi
-; CHECK-NEXT: popl %edi
-; CHECK-NEXT: retl
+; X86-LABEL: test_sincos_v2f64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $52, %esp
+; X86-NEXT: movl 84(%esp), %esi
+; X86-NEXT: fldl 72(%esp)
+; X86-NEXT: fstpl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Spill
+; X86-NEXT: fldl 64(%esp)
+; X86-NEXT: movl 80(%esp), %edi
+; X86-NEXT: leal 24(%esp), %eax
+; X86-NEXT: movl %eax, 12(%esp)
+; X86-NEXT: movl %edi, 8(%esp)
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll sincos
+; X86-NEXT: leal 32(%esp), %eax
+; X86-NEXT: movl %eax, 12(%esp)
+; X86-NEXT: addl $8, %edi
+; X86-NEXT: movl %edi, 8(%esp)
+; X86-NEXT: fldl {{[-0-9]+}}(%e{{[sb]}}p) # 8-byte Folded Reload
+; X86-NEXT: fstpl (%esp)
+; X86-NEXT: calll sincos
+; X86-NEXT: fldl 24(%esp)
+; X86-NEXT: fldl 32(%esp)
+; X86-NEXT: fstpl 8(%esi)
+; X86-NEXT: fstpl (%esi)
+; X86-NEXT: addl $52, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl
+;
+; X64-LABEL: test_sincos_v2f64:
+; X64: # %bb.0:
+; X64-NEXT: pushq %r14
+; X64-NEXT: pushq %rbx
+; X64-NEXT: subq $56, %rsp
+; X64-NEXT: movq %rsi, %rbx
+; X64-NEXT: movq %rdi, %r14
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; X64-NEXT: leaq 24(%rsp), %rdi
+; X64-NEXT: leaq 16(%rsp), %rsi
+; X64-NEXT: callq sincos@PLT
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-NEXT: leaq 8(%rsp), %rdi
+; X64-NEXT: movq %rsp, %rsi
+; X64-NEXT: callq sincos@PLT
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: movhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; X64-NEXT: movups %xmm1, (%r14)
+; X64-NEXT: movups %xmm0, (%rbx)
+; X64-NEXT: addq $56, %rsp
+; X64-NEXT: popq %rbx
+; X64-NEXT: popq %r14
+; X64-NEXT: retq
+;
+; MACOS-SINCOS-STRET-LABEL: test_sincos_v2f64:
+; MACOS-SINCOS-STRET: ## %bb.0:
+; MACOS-SINCOS-STRET-NEXT: pushq %r14
+; MACOS-SINCOS-STRET-NEXT: pushq %rbx
+; MACOS-SINCOS-STRET-NEXT: subq $56, %rsp
+; MACOS-SINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-SINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-SINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-SINCOS-STRET-NEXT: callq ___sincos_stret
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; MACOS-SINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-SINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-SINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-SINCOS-STRET-NEXT: movups %xmm2, (%rbx)
+; MACOS-SINCOS-STRET-NEXT: addq $56, %rsp
+; MACOS-SINCOS-STRET-NEXT: popq %rbx
+; MACOS-SINCOS-STRET-NEXT: popq %r14
+; MACOS-SINCOS-STRET-NEXT: retq
+;
+; MACOS-NOSINCOS-STRET-LABEL: test_sincos_v2f64:
+; MACOS-NOSINCOS-STRET: ## %bb.0:
+; MACOS-NOSINCOS-STRET-NEXT: pushq %r14
+; MACOS-NOSINCOS-STRET-NEXT: pushq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: subq $56, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: movq %rsi, %rbx
+; MACOS-NOSINCOS-STRET-NEXT: movq %rdi, %r14
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cos
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: callq _cos
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sin
+; MACOS-NOSINCOS-STRET-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: callq _sin
+; MACOS-NOSINCOS-STRET-NEXT: movaps (%rsp), %xmm1 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm1, (%r14)
+; MACOS-NOSINCOS-STRET-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; MACOS-NOSINCOS-STRET-NEXT: movups %xmm0, (%rbx)
+; MACOS-NOSINCOS-STRET-NEXT: addq $56, %rsp
+; MACOS-NOSINCOS-STRET-NEXT: popq %rbx
+; MACOS-NOSINCOS-STRET-NEXT: popq %r14
+; MACOS-NOSINCOS-STRET-NEXT: retq
%result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %x)
%result.0 = extractvalue { <2 x double>, <2 x double> } %result, 0
%result.1 = extractvalue { <2 x double>, <2 x double> } %result, 1