; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE2,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-SSE42,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,FALLBACK13
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,FALLBACK14
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,FALLBACK15
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK16
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK17
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK18
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK19
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK20
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK21
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK22
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse4.2,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE42,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK23
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK24
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK25
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK26
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK27
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,FALLBACK28
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,FALLBACK29
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,FALLBACK30
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,FALLBACK31

; Codegen test for variable scalar shifts whose shift amount is a byte offset
; loaded from memory and scaled to a bit count (shl by 3) before shifting.
; The configurations above run llc for x86_64 and i686 with sse2, sse4.2, avx
; and avx512vl, each crossed with +/-bmi2 and +/-slow-shld; every configuration
; also carries its own numbered fallback check prefix.
; The check lines are generated output (see the first line of this file), so
; regenerate them with the script rather than editing them by hand.

; lshr_4bytes - loads an i32 value from %src.ptr and an i32 byte offset from
; %byteOff.ptr (both align 1), multiplies the offset by 8 via shl 3, logically
; shifts the value right by that many bits and stores the i32 result to %dst.
define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: lshr_4bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: shrl %cl, %eax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: lshr_4bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: shrxl %eax, (%rdi), %eax
; X64-HAVE-BMI2-NEXT: movl %eax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-BMI2-LABEL: lshr_4bytes:
; X86-NO-BMI2: # %bb.0:
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movl (%edx), %edx
; X86-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-NO-BMI2-NEXT: shlb $3, %cl
; X86-NO-BMI2-NEXT: shrl %cl, %edx
; X86-NO-BMI2-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-HAVE-BMI2-LABEL: lshr_4bytes:
; X86-HAVE-BMI2: # %bb.0:
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NEXT: movzbl (%edx), %edx
; X86-HAVE-BMI2-NEXT: shlb $3, %dl
; X86-HAVE-BMI2-NEXT: shrxl %edx, (%ecx), %ecx
; X86-HAVE-BMI2-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-NEXT: retl
  %src = load i32, ptr %src.ptr, align 1
  %byteOff = load i32, ptr %byteOff.ptr, align 1
  %bitOff = shl i32 %byteOff, 3
  %res = lshr i32 %src, %bitOff
  store i32 %res, ptr %dst, align 1
  ret void
}

; shl_4bytes - same shape as the i32 logical-right-shift case, but the loaded
; i32 value is shifted left (shl) by the byte offset scaled to bits.
define void @shl_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: shl_4bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: shll %cl, %eax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: shl_4bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: shlxl %eax, (%rdi), %eax
; X64-HAVE-BMI2-NEXT: movl %eax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-BMI2-LABEL: shl_4bytes:
; X86-NO-BMI2: # %bb.0:
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movl (%edx), %edx
; X86-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-NO-BMI2-NEXT: shlb $3, %cl
; X86-NO-BMI2-NEXT: shll %cl, %edx
; X86-NO-BMI2-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-HAVE-BMI2-LABEL: shl_4bytes:
; X86-HAVE-BMI2: # %bb.0:
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NEXT: movzbl (%edx), %edx
; X86-HAVE-BMI2-NEXT: shlb $3, %dl
; X86-HAVE-BMI2-NEXT: shlxl %edx, (%ecx), %ecx
; X86-HAVE-BMI2-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-NEXT: retl
  %src = load i32, ptr %src.ptr, align 1
  %byteOff = load i32, ptr %byteOff.ptr, align 1
  %bitOff = shl i32 %byteOff, 3
  %res = shl i32 %src, %bitOff
  store i32 %res, ptr %dst, align 1
  ret void
}

; ashr_4bytes - same shape again for i32, using an arithmetic right shift
; (ashr) by the byte offset scaled to bits.
define void @ashr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: ashr_4bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movl (%rdi), %eax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: sarl %cl, %eax
; X64-NO-BMI2-NEXT: movl %eax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: ashr_4bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: sarxl %eax, (%rdi), %eax
; X64-HAVE-BMI2-NEXT: movl %eax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-BMI2-LABEL: ashr_4bytes:
; X86-NO-BMI2: # %bb.0:
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-BMI2-NEXT: movl (%edx), %edx
; X86-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-NO-BMI2-NEXT: shlb $3, %cl
; X86-NO-BMI2-NEXT: sarl %cl, %edx
; X86-NO-BMI2-NEXT: movl %edx, (%eax)
; X86-NO-BMI2-NEXT: retl
;
; X86-HAVE-BMI2-LABEL: ashr_4bytes:
; X86-HAVE-BMI2: # %bb.0:
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-BMI2-NEXT: movzbl (%edx), %edx
; X86-HAVE-BMI2-NEXT: shlb $3, %dl
; X86-HAVE-BMI2-NEXT: sarxl %edx, (%ecx), %ecx
; X86-HAVE-BMI2-NEXT: movl %ecx, (%eax)
; X86-HAVE-BMI2-NEXT: retl
  %src = load i32, ptr %src.ptr, align 1
  %byteOff = load i32, ptr %byteOff.ptr, align 1
  %bitOff = shl i32 %byteOff, 3
  %res = ashr i32 %src, %bitOff
  store i32 %res, ptr %dst, align 1
  ret void
}

; lshr_8bytes - loads an i64 value from %src.ptr and an i64 byte offset from
; %byteOff.ptr (both align 1), multiplies the offset by 8 via shl 3, logically
; shifts the value right by that many bits and stores the i64 result to %dst.
; The i686 check blocks below show the shift expanded into 32-bit halves.
define void @lshr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: lshr_8bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: shrq %cl, %rax
; X64-NO-BMI2-NEXT: movq %rax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: lshr_8bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: shrxq %rax, (%rdi), %rax
; X64-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-SHLD-NO-BMI2-LABEL: lshr_8bytes:
; X86-NO-SHLD-NO-BMI2: # %bb.0:
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movzbl (%eax), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: leal (%esi,%esi), %edi
; X86-NO-SHLD-NO-BMI2-NEXT: notb %cl
; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %esi
; X86-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: testb $32, %al
; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %esi, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %esi, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, 4(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, (%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-NO-BMI2-LABEL: lshr_8bytes:
; X86-HAVE-SHLD-NO-BMI2: # %bb.0:
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esi), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esi), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: shrl %cl, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: xorl %esi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovnel %edi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovel %edi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 4(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: retl
;
; X86-NO-SHLD-HAVE-BMI2-LABEL: lshr_8bytes:
; X86-NO-SHLD-HAVE-BMI2: # %bb.0:
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, (%edx), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %bl
; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%esi,%esi), %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %ebx, %edi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %edx, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: testb $32, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovnel %edx, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovel %edx, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %esi, 4(%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_8bytes:
; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0:
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esi), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esi), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrxl %ecx, %esi, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %esi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, 4(%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl
  %src = load i64, ptr %src.ptr, align 1
  %byteOff = load i64, ptr %byteOff.ptr, align 1
  %bitOff = shl i64 %byteOff, 3
  %res = lshr i64 %src, %bitOff
  store i64 %res, ptr %dst, align 1
  ret void
}

; shl_8bytes - i64 variant that shifts the loaded value left (shl) by the
; loaded byte offset scaled to bits, then stores the i64 result to %dst.
define void @shl_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: shl_8bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: shlq %cl, %rax
; X64-NO-BMI2-NEXT: movq %rax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: shl_8bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: shlxq %rax, (%rdi), %rax
; X64-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-SHLD-NO-BMI2-LABEL: shl_8bytes:
; X86-NO-SHLD-NO-BMI2: # %bb.0:
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: movzbl (%eax), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: shrl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: notb %cl
; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %esi
; X86-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: testb $32, %al
; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %esi, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %esi, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, (%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, 4(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-NO-BMI2-LABEL: shl_8bytes:
; X86-HAVE-SHLD-NO-BMI2: # %bb.0:
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%edx), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%edx), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: shll %cl, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: shldl %cl, %esi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: xorl %esi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovnel %edi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovel %edi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, 4(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, (%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: retl
;
; X86-NO-SHLD-HAVE-BMI2-LABEL: shl_8bytes:
; X86-NO-SHLD-HAVE-BMI2: # %bb.0:
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %ecx, 4(%edx), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %bl
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %ecx, %esi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: shrl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %ebx, %esi, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %edx, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: xorl %edx, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: testb $32, %cl
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovnel %edi, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovel %edi, %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, (%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %esi, 4(%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-HAVE-BMI2-LABEL: shl_8bytes:
; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0:
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shldl %cl, %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlxl %ecx, %esi, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %esi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, 4(%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, (%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl
  %src = load i64, ptr %src.ptr, align 1
  %byteOff = load i64, ptr %byteOff.ptr, align 1
  %bitOff = shl i64 %byteOff, 3
  %res = shl i64 %src, %bitOff
  store i64 %res, ptr %dst, align 1
  ret void
}

; ashr_8bytes - i64 variant that arithmetically shifts the loaded value right
; (ashr) by the loaded byte offset scaled to bits, then stores the i64 result
; to %dst.
define void @ashr_8bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: ashr_8bytes:
; X64-NO-BMI2: # %bb.0:
; X64-NO-BMI2-NEXT: movq (%rdi), %rax
; X64-NO-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-BMI2-NEXT: shlb $3, %cl
; X64-NO-BMI2-NEXT: sarq %cl, %rax
; X64-NO-BMI2-NEXT: movq %rax, (%rdx)
; X64-NO-BMI2-NEXT: retq
;
; X64-HAVE-BMI2-LABEL: ashr_8bytes:
; X64-HAVE-BMI2: # %bb.0:
; X64-HAVE-BMI2-NEXT: movzbl (%rsi), %eax
; X64-HAVE-BMI2-NEXT: shlb $3, %al
; X64-HAVE-BMI2-NEXT: sarxq %rax, (%rdi), %rax
; X64-HAVE-BMI2-NEXT: movq %rax, (%rdx)
; X64-HAVE-BMI2-NEXT: retq
;
; X86-NO-SHLD-NO-BMI2-LABEL: ashr_8bytes:
; X86-NO-SHLD-NO-BMI2: # %bb.0:
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movzbl (%eax), %eax
; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: leal (%esi,%esi), %edi
; X86-NO-SHLD-NO-BMI2-NEXT: notb %cl
; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X86-NO-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: sarl $31, %esi
; X86-NO-SHLD-NO-BMI2-NEXT: testb $32, %al
; X86-NO-SHLD-NO-BMI2-NEXT: cmovnel %ebx, %edi
; X86-NO-SHLD-NO-BMI2-NEXT: cmovel %ebx, %esi
; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, 4(%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, (%edx)
; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi
; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-NO-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_8bytes:
; X86-HAVE-SHLD-NO-BMI2: # %bb.0:
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esi), %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esi), %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl %cl, %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl $31, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovnel %edi, %edx
; X86-HAVE-SHLD-NO-BMI2-NEXT: cmovel %edi, %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 4(%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%eax)
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-NO-BMI2-NEXT: retl
;
; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_8bytes:
; X86-NO-SHLD-HAVE-BMI2: # %bb.0:
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esi), %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%edx), %edx
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %dl
; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %edx, (%esi), %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %bl
; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ecx,%ecx), %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %ebx, %edi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %esi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %edx, %ecx, %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: sarl $31, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: testb $32, %dl
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovnel %esi, %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: cmovel %esi, %ecx
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%eax)
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebx
; X86-NO-SHLD-HAVE-BMI2-NEXT: retl
;
; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_8bytes:
; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0:
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esi), %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esi), %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %ecx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %esi, %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: testb $32, %cl
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovnel %edi, %edx
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: cmovel %edi, %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, 4(%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%eax)
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi
; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl
  %src = load i64, ptr %src.ptr, align 1
  %byteOff = load i64, ptr %byteOff.ptr, align 1
  %bitOff = shl i64 %byteOff, 3
  %res = ashr i64 %src, %bitOff
  store i64 %res, ptr %dst, align 1
  ret void
}

define void @lshr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-SHLD-NO-BMI2-LABEL: lshr_16bytes:
; X64-NO-SHLD-NO-BMI2: # %bb.0:
; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8
; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi
; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax
; X64-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al
; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx
; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8
; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi
; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl ; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al ; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %rdi, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %rdi, %rcx ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, 8(%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-NO-BMI2-LABEL: lshr_16bytes: ; X64-HAVE-SHLD-NO-BMI2: # %bb.0: ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %edi, %edi ; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: retq ; ; X64-NO-SHLD-HAVE-BMI2-LABEL: lshr_16bytes: ; X64-NO-SHLD-HAVE-BMI2: # %bb.0: ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi ; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil ; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8 ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, %rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rax, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi ; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, 
%rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, 8(%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_16bytes: ; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0: ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rdi, %rsi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; ; FALLBACK16-LABEL: lshr_16bytes: ; FALLBACK16: # %bb.0: ; FALLBACK16-NEXT: pushl %ebp ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi ; FALLBACK16-NEXT: subl $60, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK16-NEXT: movl (%ecx), %edx ; FALLBACK16-NEXT: movl 4(%ecx), %esi ; FALLBACK16-NEXT: movl 8(%ecx), %edi ; FALLBACK16-NEXT: movl 12(%ecx), %ecx ; FALLBACK16-NEXT: movb (%eax), %ah ; FALLBACK16-NEXT: movb %ah, %al ; FALLBACK16-NEXT: shlb $3, %al ; FALLBACK16-NEXT: xorps %xmm0, %xmm0 ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: andb $12, %ah ; FALLBACK16-NEXT: movzbl %ah, %ebp ; FALLBACK16-NEXT: movl 20(%esp,%ebp), %esi ; FALLBACK16-NEXT: movl %esi, %ebx ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: movl %eax, %edx ; FALLBACK16-NEXT: notb %dl ; FALLBACK16-NEXT: 
movl 24(%esp,%ebp), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: leal (%ecx,%ecx), %edi ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: orl %ebx, %edi ; FALLBACK16-NEXT: movl 16(%esp,%ebp), %ebx ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: addl %esi, %esi ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: orl %ebx, %esi ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK16-NEXT: movl 28(%esp,%ebp), %ebx ; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: movl %ebx, 12(%edx) ; FALLBACK16-NEXT: movl %ebp, 8(%edx) ; FALLBACK16-NEXT: movl %esi, (%edx) ; FALLBACK16-NEXT: movl %edi, 4(%edx) ; FALLBACK16-NEXT: addl $60, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx ; FALLBACK16-NEXT: popl %ebp ; FALLBACK16-NEXT: retl ; ; FALLBACK17-LABEL: lshr_16bytes: ; FALLBACK17: # %bb.0: ; FALLBACK17-NEXT: pushl %ebp ; FALLBACK17-NEXT: pushl %ebx ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi ; FALLBACK17-NEXT: subl $44, %esp ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK17-NEXT: movl (%edx), %esi ; FALLBACK17-NEXT: movl 4(%edx), %edi ; FALLBACK17-NEXT: movl 8(%edx), %ebx ; FALLBACK17-NEXT: movl 12(%edx), %edx ; FALLBACK17-NEXT: movb (%ecx), %ch ; FALLBACK17-NEXT: movb %ch, %cl ; FALLBACK17-NEXT: shlb $3, %cl ; FALLBACK17-NEXT: xorps %xmm0, %xmm0 ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl 
%edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, (%esp) ; FALLBACK17-NEXT: andb $12, %ch ; FALLBACK17-NEXT: movzbl %ch, %ebx ; FALLBACK17-NEXT: movl 8(%esp,%ebx), %esi ; FALLBACK17-NEXT: movl (%esp,%ebx), %edx ; FALLBACK17-NEXT: movl 4(%esp,%ebx), %ebp ; FALLBACK17-NEXT: movl %ebp, %edi ; FALLBACK17-NEXT: shrdl %cl, %esi, %edi ; FALLBACK17-NEXT: movl 12(%esp,%ebx), %ebx ; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi ; FALLBACK17-NEXT: shrdl %cl, %ebp, %edx ; FALLBACK17-NEXT: shrl %cl, %ebx ; FALLBACK17-NEXT: movl %esi, 8(%eax) ; FALLBACK17-NEXT: movl %ebx, 12(%eax) ; FALLBACK17-NEXT: movl %edx, (%eax) ; FALLBACK17-NEXT: movl %edi, 4(%eax) ; FALLBACK17-NEXT: addl $44, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi ; FALLBACK17-NEXT: popl %ebx ; FALLBACK17-NEXT: popl %ebp ; FALLBACK17-NEXT: retl ; ; FALLBACK18-LABEL: lshr_16bytes: ; FALLBACK18: # %bb.0: ; FALLBACK18-NEXT: pushl %ebp ; FALLBACK18-NEXT: pushl %ebx ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $44, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK18-NEXT: movl (%ecx), %edx ; FALLBACK18-NEXT: movl 4(%ecx), %esi ; FALLBACK18-NEXT: movl 8(%ecx), %edi ; FALLBACK18-NEXT: movl 12(%ecx), %ecx ; FALLBACK18-NEXT: movzbl (%eax), %ebx ; FALLBACK18-NEXT: movl %ebx, %eax ; FALLBACK18-NEXT: shlb $3, %al ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, (%esp) ; FALLBACK18-NEXT: andb $12, %bl ; FALLBACK18-NEXT: movzbl %bl, %esi ; FALLBACK18-NEXT: movl 4(%esp,%esi), %edi ; FALLBACK18-NEXT: movl 8(%esp,%esi), %ebx ; FALLBACK18-NEXT: shrxl %eax, %edi, %ebp ; FALLBACK18-NEXT: movl %eax, %edx ; FALLBACK18-NEXT: notb 
%dl ; FALLBACK18-NEXT: leal (%ebx,%ebx), %ecx ; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx ; FALLBACK18-NEXT: orl %ebp, %ecx ; FALLBACK18-NEXT: shrxl %eax, (%esp,%esi), %ebp ; FALLBACK18-NEXT: addl %edi, %edi ; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %ebp, %edi ; FALLBACK18-NEXT: shrxl %eax, %ebx, %ebx ; FALLBACK18-NEXT: movl 12(%esp,%esi), %esi ; FALLBACK18-NEXT: shrxl %eax, %esi, %eax ; FALLBACK18-NEXT: addl %esi, %esi ; FALLBACK18-NEXT: shlxl %edx, %esi, %edx ; FALLBACK18-NEXT: orl %ebx, %edx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK18-NEXT: movl %eax, 12(%esi) ; FALLBACK18-NEXT: movl %edx, 8(%esi) ; FALLBACK18-NEXT: movl %edi, (%esi) ; FALLBACK18-NEXT: movl %ecx, 4(%esi) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi ; FALLBACK18-NEXT: popl %ebx ; FALLBACK18-NEXT: popl %ebp ; FALLBACK18-NEXT: retl ; ; FALLBACK19-LABEL: lshr_16bytes: ; FALLBACK19: # %bb.0: ; FALLBACK19-NEXT: pushl %ebp ; FALLBACK19-NEXT: pushl %ebx ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $44, %esp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK19-NEXT: movl (%edx), %esi ; FALLBACK19-NEXT: movl 4(%edx), %edi ; FALLBACK19-NEXT: movl 8(%edx), %ebx ; FALLBACK19-NEXT: movl 12(%edx), %edx ; FALLBACK19-NEXT: movzbl (%ecx), %eax ; FALLBACK19-NEXT: movl %eax, %ecx ; FALLBACK19-NEXT: shlb $3, %cl ; FALLBACK19-NEXT: xorps %xmm0, %xmm0 ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, (%esp) ; FALLBACK19-NEXT: andb $12, %al ; FALLBACK19-NEXT: movzbl %al, %eax ; FALLBACK19-NEXT: movl 8(%esp,%eax), %ebx ; FALLBACK19-NEXT: movl (%esp,%eax), %edx ; FALLBACK19-NEXT: movl 4(%esp,%eax), %esi ; FALLBACK19-NEXT: movl %esi, 
%edi ; FALLBACK19-NEXT: shrdl %cl, %ebx, %edi ; FALLBACK19-NEXT: movl 12(%esp,%eax), %eax ; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK19-NEXT: movl %ebx, 8(%ebp) ; FALLBACK19-NEXT: shrxl %ecx, %eax, %eax ; FALLBACK19-NEXT: movl %eax, 12(%ebp) ; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK19-NEXT: shrdl %cl, %esi, %edx ; FALLBACK19-NEXT: movl %edx, (%ebp) ; FALLBACK19-NEXT: movl %edi, 4(%ebp) ; FALLBACK19-NEXT: addl $44, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi ; FALLBACK19-NEXT: popl %ebx ; FALLBACK19-NEXT: popl %ebp ; FALLBACK19-NEXT: retl ; ; FALLBACK20-LABEL: lshr_16bytes: ; FALLBACK20: # %bb.0: ; FALLBACK20-NEXT: pushl %ebp ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; FALLBACK20-NEXT: pushl %esi ; FALLBACK20-NEXT: subl $60, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 ; FALLBACK20-NEXT: movzbl (%eax), %ecx ; FALLBACK20-NEXT: movl %ecx, %eax ; FALLBACK20-NEXT: shlb $3, %al ; FALLBACK20-NEXT: xorps %xmm1, %xmm1 ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: andb $12, %cl ; FALLBACK20-NEXT: movzbl %cl, %edi ; FALLBACK20-NEXT: movl 16(%esp,%edi), %ebx ; FALLBACK20-NEXT: movl 20(%esp,%edi), %esi ; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: movl %eax, %edx ; FALLBACK20-NEXT: notb %dl ; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: orl %ebx, %esi ; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 24(%esp,%edi), %ebx ; FALLBACK20-NEXT: movl %ebx, %esi ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %esi ; FALLBACK20-NEXT: movl 28(%esp,%edi), %edi ; FALLBACK20-NEXT: leal (%edi,%edi), %ebp ; 
FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebp ; FALLBACK20-NEXT: orl %esi, %ebp ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %esi ; FALLBACK20-NEXT: addl %ebx, %ebx ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %esi, %ebx ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: movl %edi, 12(%edx) ; FALLBACK20-NEXT: movl %ebx, 4(%edx) ; FALLBACK20-NEXT: movl %ebp, 8(%edx) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: movl %eax, (%edx) ; FALLBACK20-NEXT: addl $60, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx ; FALLBACK20-NEXT: popl %ebp ; FALLBACK20-NEXT: retl ; ; FALLBACK21-LABEL: lshr_16bytes: ; FALLBACK21: # %bb.0: ; FALLBACK21-NEXT: pushl %ebp ; FALLBACK21-NEXT: pushl %ebx ; FALLBACK21-NEXT: pushl %edi ; FALLBACK21-NEXT: pushl %esi ; FALLBACK21-NEXT: subl $44, %esp ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK21-NEXT: movups (%edx), %xmm0 ; FALLBACK21-NEXT: movzbl (%ecx), %edx ; FALLBACK21-NEXT: movl %edx, %ecx ; FALLBACK21-NEXT: shlb $3, %cl ; FALLBACK21-NEXT: xorps %xmm1, %xmm1 ; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm0, (%esp) ; FALLBACK21-NEXT: andb $12, %dl ; FALLBACK21-NEXT: movzbl %dl, %ebx ; FALLBACK21-NEXT: movl 12(%esp,%ebx), %edx ; FALLBACK21-NEXT: movl 8(%esp,%ebx), %ebp ; FALLBACK21-NEXT: movl %ebp, %edi ; FALLBACK21-NEXT: shrdl %cl, %edx, %edi ; FALLBACK21-NEXT: movl (%esp,%ebx), %esi ; FALLBACK21-NEXT: movl 4(%esp,%ebx), %eax ; FALLBACK21-NEXT: movl %eax, %ebx ; FALLBACK21-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK21-NEXT: movl %ebx, 4(%ebp) ; FALLBACK21-NEXT: movl %edi, 8(%ebp) ; 
FALLBACK21-NEXT: shrdl %cl, %eax, %esi ; FALLBACK21-NEXT: shrl %cl, %edx ; FALLBACK21-NEXT: movl %edx, 12(%ebp) ; FALLBACK21-NEXT: movl %esi, (%ebp) ; FALLBACK21-NEXT: addl $44, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi ; FALLBACK21-NEXT: popl %ebx ; FALLBACK21-NEXT: popl %ebp ; FALLBACK21-NEXT: retl ; ; FALLBACK22-LABEL: lshr_16bytes: ; FALLBACK22: # %bb.0: ; FALLBACK22-NEXT: pushl %ebp ; FALLBACK22-NEXT: pushl %ebx ; FALLBACK22-NEXT: pushl %edi ; FALLBACK22-NEXT: pushl %esi ; FALLBACK22-NEXT: subl $44, %esp ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movzbl (%eax), %ecx ; FALLBACK22-NEXT: movl %ecx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: xorps %xmm1, %xmm1 ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, (%esp) ; FALLBACK22-NEXT: andb $12, %cl ; FALLBACK22-NEXT: movzbl %cl, %edi ; FALLBACK22-NEXT: shrxl %eax, (%esp,%edi), %ebx ; FALLBACK22-NEXT: movl %eax, %ecx ; FALLBACK22-NEXT: notb %cl ; FALLBACK22-NEXT: movl 4(%esp,%edi), %ebp ; FALLBACK22-NEXT: movl 8(%esp,%edi), %esi ; FALLBACK22-NEXT: leal (%ebp,%ebp), %edx ; FALLBACK22-NEXT: shlxl %ecx, %edx, %edx ; FALLBACK22-NEXT: orl %ebx, %edx ; FALLBACK22-NEXT: shrxl %eax, %esi, %ebx ; FALLBACK22-NEXT: shrxl %eax, %ebp, %ebp ; FALLBACK22-NEXT: movl 12(%esp,%edi), %edi ; FALLBACK22-NEXT: shrxl %eax, %edi, %eax ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ecx, %edi, %edi ; FALLBACK22-NEXT: orl %ebx, %edi ; FALLBACK22-NEXT: addl %esi, %esi ; FALLBACK22-NEXT: shlxl %ecx, %esi, %ecx ; FALLBACK22-NEXT: orl %ebp, %ecx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK22-NEXT: movl %eax, 12(%esi) ; FALLBACK22-NEXT: movl %ecx, 4(%esi) ; FALLBACK22-NEXT: movl %edi, 8(%esi) ; FALLBACK22-NEXT: movl %edx, (%esi) ; FALLBACK22-NEXT: addl $44, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl 
%ebx ; FALLBACK22-NEXT: popl %ebp ; FALLBACK22-NEXT: retl ; ; FALLBACK23-LABEL: lshr_16bytes: ; FALLBACK23: # %bb.0: ; FALLBACK23-NEXT: pushl %ebp ; FALLBACK23-NEXT: pushl %ebx ; FALLBACK23-NEXT: pushl %edi ; FALLBACK23-NEXT: pushl %esi ; FALLBACK23-NEXT: subl $44, %esp ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK23-NEXT: movups (%edx), %xmm0 ; FALLBACK23-NEXT: movzbl (%ecx), %edx ; FALLBACK23-NEXT: movl %edx, %ecx ; FALLBACK23-NEXT: shlb $3, %cl ; FALLBACK23-NEXT: xorps %xmm1, %xmm1 ; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm0, (%esp) ; FALLBACK23-NEXT: andb $12, %dl ; FALLBACK23-NEXT: movzbl %dl, %ebx ; FALLBACK23-NEXT: movl 12(%esp,%ebx), %edx ; FALLBACK23-NEXT: movl 8(%esp,%ebx), %ebp ; FALLBACK23-NEXT: movl %ebp, %edi ; FALLBACK23-NEXT: shrdl %cl, %edx, %edi ; FALLBACK23-NEXT: movl (%esp,%ebx), %esi ; FALLBACK23-NEXT: movl 4(%esp,%ebx), %eax ; FALLBACK23-NEXT: movl %eax, %ebx ; FALLBACK23-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK23-NEXT: movl %ebx, 4(%ebp) ; FALLBACK23-NEXT: movl %edi, 8(%ebp) ; FALLBACK23-NEXT: shrxl %ecx, %edx, %edx ; FALLBACK23-NEXT: movl %edx, 12(%ebp) ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK23-NEXT: shrdl %cl, %eax, %esi ; FALLBACK23-NEXT: movl %esi, (%ebp) ; FALLBACK23-NEXT: addl $44, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi ; FALLBACK23-NEXT: popl %ebx ; FALLBACK23-NEXT: popl %ebp ; FALLBACK23-NEXT: retl ; ; FALLBACK24-LABEL: lshr_16bytes: ; FALLBACK24: # %bb.0: ; FALLBACK24-NEXT: pushl %ebp ; FALLBACK24-NEXT: pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi ; FALLBACK24-NEXT: subl $60, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK24-NEXT: movzbl (%eax), %ecx ; FALLBACK24-NEXT: movl %ecx, %eax ; FALLBACK24-NEXT: shlb $3, %al 
; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: andb $12, %cl ; FALLBACK24-NEXT: movzbl %cl, %edi ; FALLBACK24-NEXT: movl 16(%esp,%edi), %ebx ; FALLBACK24-NEXT: movl 20(%esp,%edi), %esi ; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: movl %eax, %edx ; FALLBACK24-NEXT: notb %dl ; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: orl %ebx, %esi ; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 24(%esp,%edi), %ebx ; FALLBACK24-NEXT: movl %ebx, %esi ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %esi ; FALLBACK24-NEXT: movl 28(%esp,%edi), %edi ; FALLBACK24-NEXT: leal (%edi,%edi), %ebp ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebp ; FALLBACK24-NEXT: orl %esi, %ebp ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %esi ; FALLBACK24-NEXT: addl %ebx, %ebx ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %esi, %ebx ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: movl %edi, 12(%edx) ; FALLBACK24-NEXT: movl %ebx, 4(%edx) ; FALLBACK24-NEXT: movl %ebp, 8(%edx) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: movl %eax, (%edx) ; FALLBACK24-NEXT: addl $60, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx ; FALLBACK24-NEXT: popl %ebp ; FALLBACK24-NEXT: retl ; ; FALLBACK25-LABEL: lshr_16bytes: ; FALLBACK25: # %bb.0: ; FALLBACK25-NEXT: pushl %ebp ; FALLBACK25-NEXT: pushl %ebx ; FALLBACK25-NEXT: pushl %edi ; 
FALLBACK25-NEXT: pushl %esi ; FALLBACK25-NEXT: subl $44, %esp ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK25-NEXT: vmovups (%edx), %xmm0 ; FALLBACK25-NEXT: movzbl (%ecx), %edx ; FALLBACK25-NEXT: movl %edx, %ecx ; FALLBACK25-NEXT: shlb $3, %cl ; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovaps %xmm0, (%esp) ; FALLBACK25-NEXT: andb $12, %dl ; FALLBACK25-NEXT: movzbl %dl, %ebx ; FALLBACK25-NEXT: movl 12(%esp,%ebx), %edx ; FALLBACK25-NEXT: movl 8(%esp,%ebx), %ebp ; FALLBACK25-NEXT: movl %ebp, %edi ; FALLBACK25-NEXT: shrdl %cl, %edx, %edi ; FALLBACK25-NEXT: movl (%esp,%ebx), %esi ; FALLBACK25-NEXT: movl 4(%esp,%ebx), %eax ; FALLBACK25-NEXT: movl %eax, %ebx ; FALLBACK25-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK25-NEXT: movl %ebx, 4(%ebp) ; FALLBACK25-NEXT: movl %edi, 8(%ebp) ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi ; FALLBACK25-NEXT: shrl %cl, %edx ; FALLBACK25-NEXT: movl %edx, 12(%ebp) ; FALLBACK25-NEXT: movl %esi, (%ebp) ; FALLBACK25-NEXT: addl $44, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi ; FALLBACK25-NEXT: popl %ebx ; FALLBACK25-NEXT: popl %ebp ; FALLBACK25-NEXT: retl ; ; FALLBACK26-LABEL: lshr_16bytes: ; FALLBACK26: # %bb.0: ; FALLBACK26-NEXT: pushl %ebp ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi ; FALLBACK26-NEXT: subl $44, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK26-NEXT: movzbl (%eax), %ecx ; FALLBACK26-NEXT: movl %ecx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, (%esp) ; FALLBACK26-NEXT: andb $12, %cl ; FALLBACK26-NEXT: movzbl %cl, %edi ; FALLBACK26-NEXT: shrxl %eax, (%esp,%edi), %ebx ; 
FALLBACK26-NEXT: movl %eax, %ecx ; FALLBACK26-NEXT: notb %cl ; FALLBACK26-NEXT: movl 4(%esp,%edi), %ebp ; FALLBACK26-NEXT: movl 8(%esp,%edi), %esi ; FALLBACK26-NEXT: leal (%ebp,%ebp), %edx ; FALLBACK26-NEXT: shlxl %ecx, %edx, %edx ; FALLBACK26-NEXT: orl %ebx, %edx ; FALLBACK26-NEXT: shrxl %eax, %esi, %ebx ; FALLBACK26-NEXT: shrxl %eax, %ebp, %ebp ; FALLBACK26-NEXT: movl 12(%esp,%edi), %edi ; FALLBACK26-NEXT: shrxl %eax, %edi, %eax ; FALLBACK26-NEXT: addl %edi, %edi ; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi ; FALLBACK26-NEXT: orl %ebx, %edi ; FALLBACK26-NEXT: addl %esi, %esi ; FALLBACK26-NEXT: shlxl %ecx, %esi, %ecx ; FALLBACK26-NEXT: orl %ebp, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK26-NEXT: movl %eax, 12(%esi) ; FALLBACK26-NEXT: movl %ecx, 4(%esi) ; FALLBACK26-NEXT: movl %edi, 8(%esi) ; FALLBACK26-NEXT: movl %edx, (%esi) ; FALLBACK26-NEXT: addl $44, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx ; FALLBACK26-NEXT: popl %ebp ; FALLBACK26-NEXT: retl ; ; FALLBACK27-LABEL: lshr_16bytes: ; FALLBACK27: # %bb.0: ; FALLBACK27-NEXT: pushl %ebp ; FALLBACK27-NEXT: pushl %ebx ; FALLBACK27-NEXT: pushl %edi ; FALLBACK27-NEXT: pushl %esi ; FALLBACK27-NEXT: subl $44, %esp ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK27-NEXT: vmovups (%edx), %xmm0 ; FALLBACK27-NEXT: movzbl (%ecx), %edx ; FALLBACK27-NEXT: movl %edx, %ecx ; FALLBACK27-NEXT: shlb $3, %cl ; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovaps %xmm0, (%esp) ; FALLBACK27-NEXT: andb $12, %dl ; FALLBACK27-NEXT: movzbl %dl, %ebx ; FALLBACK27-NEXT: movl 12(%esp,%ebx), %edx ; FALLBACK27-NEXT: movl 8(%esp,%ebx), %ebp ; FALLBACK27-NEXT: movl %ebp, %edi ; FALLBACK27-NEXT: shrdl %cl, %edx, %edi ; FALLBACK27-NEXT: movl (%esp,%ebx), %esi ; FALLBACK27-NEXT: movl 4(%esp,%ebx), %eax ; FALLBACK27-NEXT: movl %eax, %ebx ; FALLBACK27-NEXT: shrdl 
%cl, %ebp, %ebx ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK27-NEXT: movl %ebx, 4(%ebp) ; FALLBACK27-NEXT: movl %edi, 8(%ebp) ; FALLBACK27-NEXT: shrxl %ecx, %edx, %edx ; FALLBACK27-NEXT: movl %edx, 12(%ebp) ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK27-NEXT: shrdl %cl, %eax, %esi ; FALLBACK27-NEXT: movl %esi, (%ebp) ; FALLBACK27-NEXT: addl $44, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi ; FALLBACK27-NEXT: popl %ebx ; FALLBACK27-NEXT: popl %ebp ; FALLBACK27-NEXT: retl ; ; FALLBACK28-LABEL: lshr_16bytes: ; FALLBACK28: # %bb.0: ; FALLBACK28-NEXT: pushl %ebp ; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi ; FALLBACK28-NEXT: subl $60, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK28-NEXT: movzbl (%eax), %ecx ; FALLBACK28-NEXT: movl %ecx, %eax ; FALLBACK28-NEXT: shlb $3, %al ; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: andb $12, %cl ; FALLBACK28-NEXT: movzbl %cl, %edi ; FALLBACK28-NEXT: movl 16(%esp,%edi), %ebx ; FALLBACK28-NEXT: movl 20(%esp,%edi), %esi ; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: movl %eax, %edx ; FALLBACK28-NEXT: notb %dl ; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: orl %ebx, %esi ; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 24(%esp,%edi), %ebx ; FALLBACK28-NEXT: movl %ebx, %esi ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %esi ; FALLBACK28-NEXT: movl 28(%esp,%edi), %edi ; FALLBACK28-NEXT: leal (%edi,%edi), %ebp ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, 
%ebp ; FALLBACK28-NEXT: orl %esi, %ebp ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %esi ; FALLBACK28-NEXT: addl %ebx, %ebx ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %esi, %ebx ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: movl %edi, 12(%edx) ; FALLBACK28-NEXT: movl %ebx, 4(%edx) ; FALLBACK28-NEXT: movl %ebp, 8(%edx) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: movl %eax, (%edx) ; FALLBACK28-NEXT: addl $60, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx ; FALLBACK28-NEXT: popl %ebp ; FALLBACK28-NEXT: retl ; ; FALLBACK29-LABEL: lshr_16bytes: ; FALLBACK29: # %bb.0: ; FALLBACK29-NEXT: pushl %ebp ; FALLBACK29-NEXT: pushl %ebx ; FALLBACK29-NEXT: pushl %edi ; FALLBACK29-NEXT: pushl %esi ; FALLBACK29-NEXT: subl $44, %esp ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK29-NEXT: vmovups (%edx), %xmm0 ; FALLBACK29-NEXT: movzbl (%ecx), %edx ; FALLBACK29-NEXT: movl %edx, %ecx ; FALLBACK29-NEXT: shlb $3, %cl ; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovaps %xmm0, (%esp) ; FALLBACK29-NEXT: andb $12, %dl ; FALLBACK29-NEXT: movzbl %dl, %ebx ; FALLBACK29-NEXT: movl 12(%esp,%ebx), %edx ; FALLBACK29-NEXT: movl 8(%esp,%ebx), %ebp ; FALLBACK29-NEXT: movl %ebp, %edi ; FALLBACK29-NEXT: shrdl %cl, %edx, %edi ; FALLBACK29-NEXT: movl (%esp,%ebx), %esi ; FALLBACK29-NEXT: movl 4(%esp,%ebx), %eax ; FALLBACK29-NEXT: movl %eax, %ebx ; FALLBACK29-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK29-NEXT: movl %ebx, 4(%ebp) ; FALLBACK29-NEXT: movl %edi, 8(%ebp) ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi ; FALLBACK29-NEXT: 
shrl %cl, %edx ; FALLBACK29-NEXT: movl %edx, 12(%ebp) ; FALLBACK29-NEXT: movl %esi, (%ebp) ; FALLBACK29-NEXT: addl $44, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi ; FALLBACK29-NEXT: popl %ebx ; FALLBACK29-NEXT: popl %ebp ; FALLBACK29-NEXT: retl ; ; FALLBACK30-LABEL: lshr_16bytes: ; FALLBACK30: # %bb.0: ; FALLBACK30-NEXT: pushl %ebp ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi ; FALLBACK30-NEXT: subl $44, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK30-NEXT: movzbl (%eax), %ecx ; FALLBACK30-NEXT: movl %ecx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, (%esp) ; FALLBACK30-NEXT: andb $12, %cl ; FALLBACK30-NEXT: movzbl %cl, %edi ; FALLBACK30-NEXT: shrxl %eax, (%esp,%edi), %ebx ; FALLBACK30-NEXT: movl %eax, %ecx ; FALLBACK30-NEXT: notb %cl ; FALLBACK30-NEXT: movl 4(%esp,%edi), %ebp ; FALLBACK30-NEXT: movl 8(%esp,%edi), %esi ; FALLBACK30-NEXT: leal (%ebp,%ebp), %edx ; FALLBACK30-NEXT: shlxl %ecx, %edx, %edx ; FALLBACK30-NEXT: orl %ebx, %edx ; FALLBACK30-NEXT: shrxl %eax, %esi, %ebx ; FALLBACK30-NEXT: shrxl %eax, %ebp, %ebp ; FALLBACK30-NEXT: movl 12(%esp,%edi), %edi ; FALLBACK30-NEXT: shrxl %eax, %edi, %eax ; FALLBACK30-NEXT: addl %edi, %edi ; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi ; FALLBACK30-NEXT: orl %ebx, %edi ; FALLBACK30-NEXT: addl %esi, %esi ; FALLBACK30-NEXT: shlxl %ecx, %esi, %ecx ; FALLBACK30-NEXT: orl %ebp, %ecx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK30-NEXT: movl %eax, 12(%esi) ; FALLBACK30-NEXT: movl %ecx, 4(%esi) ; FALLBACK30-NEXT: movl %edi, 8(%esi) ; FALLBACK30-NEXT: movl %edx, (%esi) ; FALLBACK30-NEXT: addl $44, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx ; FALLBACK30-NEXT: popl %ebp ; 
FALLBACK30-NEXT: retl ; ; FALLBACK31-LABEL: lshr_16bytes: ; FALLBACK31: # %bb.0: ; FALLBACK31-NEXT: pushl %ebp ; FALLBACK31-NEXT: pushl %ebx ; FALLBACK31-NEXT: pushl %edi ; FALLBACK31-NEXT: pushl %esi ; FALLBACK31-NEXT: subl $44, %esp ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK31-NEXT: vmovups (%edx), %xmm0 ; FALLBACK31-NEXT: movzbl (%ecx), %edx ; FALLBACK31-NEXT: movl %edx, %ecx ; FALLBACK31-NEXT: shlb $3, %cl ; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovaps %xmm0, (%esp) ; FALLBACK31-NEXT: andb $12, %dl ; FALLBACK31-NEXT: movzbl %dl, %ebx ; FALLBACK31-NEXT: movl 12(%esp,%ebx), %edx ; FALLBACK31-NEXT: movl 8(%esp,%ebx), %ebp ; FALLBACK31-NEXT: movl %ebp, %edi ; FALLBACK31-NEXT: shrdl %cl, %edx, %edi ; FALLBACK31-NEXT: movl (%esp,%ebx), %esi ; FALLBACK31-NEXT: movl 4(%esp,%ebx), %eax ; FALLBACK31-NEXT: movl %eax, %ebx ; FALLBACK31-NEXT: shrdl %cl, %ebp, %ebx ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK31-NEXT: movl %ebx, 4(%ebp) ; FALLBACK31-NEXT: movl %edi, 8(%ebp) ; FALLBACK31-NEXT: shrxl %ecx, %edx, %edx ; FALLBACK31-NEXT: movl %edx, 12(%ebp) ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK31-NEXT: shrdl %cl, %eax, %esi ; FALLBACK31-NEXT: movl %esi, (%ebp) ; FALLBACK31-NEXT: addl $44, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi ; FALLBACK31-NEXT: popl %ebx ; FALLBACK31-NEXT: popl %ebp ; FALLBACK31-NEXT: retl %src = load i128, ptr %src.ptr, align 1 %byteOff = load i128, ptr %byteOff.ptr, align 1 %bitOff = shl i128 %byteOff, 3 %res = lshr i128 %src, %bitOff store i128 %res, ptr %dst, align 1 ret void } ; Test: logical right shift (lshr) of an i128 by a dword-granular offset. ; Loads the i128 value from %src.ptr and the i128 offset from %dwordOff.ptr (both align 1), ; converts the offset to a bit count with shl by 5 (x32), and stores the lshr result to %dst (align 1). ; NOTE: the CHECK lines in this function are autogenerated by utils/update_llc_test_checks.py; regenerate them instead of editing by hand. define void @lshr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; X64-NO-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff: ; X64-NO-SHLD-NO-BMI2: # %bb.0: ; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; 
X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax ; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl ; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al ; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %rdi, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %rdi, %rcx ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, 8(%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-NO-BMI2-LABEL: lshr_16bytes_dwordOff: ; X64-HAVE-SHLD-NO-BMI2: # %bb.0: ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %edi, %edi ; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: retq ; ; X64-NO-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff: ; X64-NO-SHLD-HAVE-BMI2: # %bb.0: ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi ; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil ; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8 ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, %rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, 
%rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rax, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi ; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, %rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, 8(%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-HAVE-BMI2-LABEL: lshr_16bytes_dwordOff: ; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0: ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, %rdi, %rsi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %edi, %edi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; ; X86-SSE2-LABEL: lshr_16bytes_dwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $32, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE2-NEXT: movl (%edx), %esi ; X86-SSE2-NEXT: movl 4(%edx), %edi ; X86-SSE2-NEXT: movl 8(%edx), %ebx ; X86-SSE2-NEXT: movl 12(%edx), %edx ; X86-SSE2-NEXT: movzbl (%ecx), %ecx ; X86-SSE2-NEXT: xorps %xmm0, %xmm0 ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, (%esp) ; X86-SSE2-NEXT: andl $3, %ecx ; X86-SSE2-NEXT: movl (%esp,%ecx,4), %edx ; X86-SSE2-NEXT: movl 4(%esp,%ecx,4), %esi ; X86-SSE2-NEXT: 
movl 12(%esp,%ecx,4), %edi ; X86-SSE2-NEXT: movl 8(%esp,%ecx,4), %ecx ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl %edi, 12(%eax) ; X86-SSE2-NEXT: movl %edx, (%eax) ; X86-SSE2-NEXT: movl %esi, 4(%eax) ; X86-SSE2-NEXT: addl $32, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: lshr_16bytes_dwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: subl $44, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movzbl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm1, %xmm1 ; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm0, (%esp) ; X86-SSE42-NEXT: andl $3, %ecx ; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0 ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $44, %esp ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: lshr_16bytes_dwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: subl $44, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovups (%edx), %xmm0 ; X86-AVX-NEXT: movzbl (%ecx), %ecx ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovaps %xmm0, (%esp) ; X86-AVX-NEXT: andl $3, %ecx ; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0 ; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: addl $44, %esp ; X86-AVX-NEXT: retl %src = load i128, ptr %src.ptr, align 1 %dwordOff = load i128, ptr %dwordOff.ptr, align 1 %bitOff = shl i128 %dwordOff, 5 %res = lshr i128 %src, %bitOff store i128 %res, ptr %dst, align 1 ret void } define void @shl_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes: ; X64-NO-SHLD-NO-BMI2: # %bb.0: ; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; 
X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax ; X64-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: movq %r8, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: shrq %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl ; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: orq %rdi, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al ; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rcx ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, (%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, 8(%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-NO-BMI2-LABEL: shl_16bytes: ; X64-HAVE-SHLD-NO-BMI2: # %bb.0: ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: shldq %cl, %rax, %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %eax, %eax ; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: retq ; ; X64-NO-SHLD-HAVE-BMI2-LABEL: shl_16bytes: ; X64-NO-SHLD-HAVE-BMI2: # %bb.0: ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, 8(%rdi), %rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi ; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %r8 ; X64-NO-SHLD-HAVE-BMI2-NEXT: shrq %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rdi, 
%rax, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi ; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %r8, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %r8, %rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-HAVE-BMI2-LABEL: shl_16bytes: ; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0: ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shldq %cl, %rax, %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, %rsi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; ; FALLBACK16-LABEL: shl_16bytes: ; FALLBACK16: # %bb.0: ; FALLBACK16-NEXT: pushl %ebp ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi ; FALLBACK16-NEXT: subl $60, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK16-NEXT: movl (%ecx), %ebx ; FALLBACK16-NEXT: movl 4(%ecx), %esi ; FALLBACK16-NEXT: movl 8(%ecx), %edi ; FALLBACK16-NEXT: movl 12(%ecx), %ecx ; FALLBACK16-NEXT: movb (%eax), %ah ; FALLBACK16-NEXT: movb %ah, %dh ; FALLBACK16-NEXT: shlb $3, %dh ; FALLBACK16-NEXT: xorps %xmm0, %xmm0 ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: andb $12, %ah ; FALLBACK16-NEXT: negb %ah ; 
FALLBACK16-NEXT: movsbl %ah, %ebp ; FALLBACK16-NEXT: movl 32(%esp,%ebp), %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 36(%esp,%ebp), %esi ; FALLBACK16-NEXT: movl %esi, %edi ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: movb %dh, %dl ; FALLBACK16-NEXT: notb %dl ; FALLBACK16-NEXT: shrl %ebx ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: orl %edi, %ebx ; FALLBACK16-NEXT: movl 44(%esp,%ebp), %eax ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: movl 40(%esp,%ebp), %edi ; FALLBACK16-NEXT: movl %edi, %ebp ; FALLBACK16-NEXT: shrl %ebp ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %ebp ; FALLBACK16-NEXT: orl %eax, %ebp ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: shrl %esi ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: orl %edi, %esi ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: movl %edx, (%eax) ; FALLBACK16-NEXT: movl %esi, 8(%eax) ; FALLBACK16-NEXT: movl %ebp, 12(%eax) ; FALLBACK16-NEXT: movl %ebx, 4(%eax) ; FALLBACK16-NEXT: addl $60, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx ; FALLBACK16-NEXT: popl %ebp ; FALLBACK16-NEXT: retl ; ; FALLBACK17-LABEL: shl_16bytes: ; FALLBACK17: # %bb.0: ; FALLBACK17-NEXT: pushl %ebx ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi ; FALLBACK17-NEXT: subl $32, %esp ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK17-NEXT: movl (%edx), %esi ; FALLBACK17-NEXT: movl 4(%edx), %edi ; FALLBACK17-NEXT: movl 8(%edx), %ebx ; FALLBACK17-NEXT: movl 12(%edx), %edx ; 
FALLBACK17-NEXT: movb (%ecx), %ch ; FALLBACK17-NEXT: movb %ch, %cl ; FALLBACK17-NEXT: shlb $3, %cl ; FALLBACK17-NEXT: xorps %xmm0, %xmm0 ; FALLBACK17-NEXT: movaps %xmm0, (%esp) ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: andb $12, %ch ; FALLBACK17-NEXT: negb %ch ; FALLBACK17-NEXT: movsbl %ch, %edi ; FALLBACK17-NEXT: movl 24(%esp,%edi), %esi ; FALLBACK17-NEXT: movl 28(%esp,%edi), %edx ; FALLBACK17-NEXT: shldl %cl, %esi, %edx ; FALLBACK17-NEXT: movl 16(%esp,%edi), %ebx ; FALLBACK17-NEXT: movl 20(%esp,%edi), %edi ; FALLBACK17-NEXT: shldl %cl, %edi, %esi ; FALLBACK17-NEXT: shldl %cl, %ebx, %edi ; FALLBACK17-NEXT: shll %cl, %ebx ; FALLBACK17-NEXT: movl %esi, 8(%eax) ; FALLBACK17-NEXT: movl %edx, 12(%eax) ; FALLBACK17-NEXT: movl %ebx, (%eax) ; FALLBACK17-NEXT: movl %edi, 4(%eax) ; FALLBACK17-NEXT: addl $32, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi ; FALLBACK17-NEXT: popl %ebx ; FALLBACK17-NEXT: retl ; ; FALLBACK18-LABEL: shl_16bytes: ; FALLBACK18: # %bb.0: ; FALLBACK18-NEXT: pushl %ebp ; FALLBACK18-NEXT: pushl %ebx ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $44, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK18-NEXT: movl (%ecx), %edx ; FALLBACK18-NEXT: movl 4(%ecx), %esi ; FALLBACK18-NEXT: movl 8(%ecx), %edi ; FALLBACK18-NEXT: movl 12(%ecx), %ecx ; FALLBACK18-NEXT: movzbl (%eax), %eax ; FALLBACK18-NEXT: movl %eax, %ebx ; FALLBACK18-NEXT: shlb $3, %bl ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, (%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: andb $12, %al ; FALLBACK18-NEXT: negb %al ; 
FALLBACK18-NEXT: movsbl %al, %edx ; FALLBACK18-NEXT: movl 16(%esp,%edx), %edi ; FALLBACK18-NEXT: movl 20(%esp,%edx), %ecx ; FALLBACK18-NEXT: shlxl %ebx, %ecx, %esi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %ebp ; FALLBACK18-NEXT: movl %ebx, %eax ; FALLBACK18-NEXT: notb %al ; FALLBACK18-NEXT: shrl %edi ; FALLBACK18-NEXT: shrxl %eax, %edi, %edi ; FALLBACK18-NEXT: orl %esi, %edi ; FALLBACK18-NEXT: shlxl %ebx, 28(%esp,%edx), %esi ; FALLBACK18-NEXT: movl 24(%esp,%edx), %edx ; FALLBACK18-NEXT: shlxl %ebx, %edx, %ebx ; FALLBACK18-NEXT: shrl %edx ; FALLBACK18-NEXT: shrxl %eax, %edx, %edx ; FALLBACK18-NEXT: orl %esi, %edx ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %eax, %ecx, %eax ; FALLBACK18-NEXT: orl %ebx, %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK18-NEXT: movl %ebp, (%ecx) ; FALLBACK18-NEXT: movl %eax, 8(%ecx) ; FALLBACK18-NEXT: movl %edx, 12(%ecx) ; FALLBACK18-NEXT: movl %edi, 4(%ecx) ; FALLBACK18-NEXT: addl $44, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi ; FALLBACK18-NEXT: popl %ebx ; FALLBACK18-NEXT: popl %ebp ; FALLBACK18-NEXT: retl ; ; FALLBACK19-LABEL: shl_16bytes: ; FALLBACK19: # %bb.0: ; FALLBACK19-NEXT: pushl %ebp ; FALLBACK19-NEXT: pushl %ebx ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $44, %esp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK19-NEXT: movl (%edx), %esi ; FALLBACK19-NEXT: movl 4(%edx), %edi ; FALLBACK19-NEXT: movl 8(%edx), %ebx ; FALLBACK19-NEXT: movl 12(%edx), %edx ; FALLBACK19-NEXT: movzbl (%ecx), %eax ; FALLBACK19-NEXT: movl %eax, %ecx ; FALLBACK19-NEXT: shlb $3, %cl ; FALLBACK19-NEXT: xorps %xmm0, %xmm0 ; FALLBACK19-NEXT: movaps %xmm0, (%esp) ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: andb $12, 
%al ; FALLBACK19-NEXT: negb %al ; FALLBACK19-NEXT: movsbl %al, %eax ; FALLBACK19-NEXT: movl 24(%esp,%eax), %esi ; FALLBACK19-NEXT: movl 28(%esp,%eax), %edx ; FALLBACK19-NEXT: shldl %cl, %esi, %edx ; FALLBACK19-NEXT: movl 16(%esp,%eax), %edi ; FALLBACK19-NEXT: movl 20(%esp,%eax), %eax ; FALLBACK19-NEXT: shldl %cl, %eax, %esi ; FALLBACK19-NEXT: shldl %cl, %edi, %eax ; FALLBACK19-NEXT: shlxl %ecx, %edi, %ecx ; FALLBACK19-NEXT: movl %esi, 8(%ebp) ; FALLBACK19-NEXT: movl %edx, 12(%ebp) ; FALLBACK19-NEXT: movl %ecx, (%ebp) ; FALLBACK19-NEXT: movl %eax, 4(%ebp) ; FALLBACK19-NEXT: addl $44, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi ; FALLBACK19-NEXT: popl %ebx ; FALLBACK19-NEXT: popl %ebp ; FALLBACK19-NEXT: retl ; ; FALLBACK20-LABEL: shl_16bytes: ; FALLBACK20: # %bb.0: ; FALLBACK20-NEXT: pushl %ebp ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; FALLBACK20-NEXT: pushl %esi ; FALLBACK20-NEXT: subl $60, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 ; FALLBACK20-NEXT: movzbl (%eax), %ecx ; FALLBACK20-NEXT: movl %ecx, %eax ; FALLBACK20-NEXT: shlb $3, %al ; FALLBACK20-NEXT: xorps %xmm1, %xmm1 ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: andb $12, %cl ; FALLBACK20-NEXT: negb %cl ; FALLBACK20-NEXT: movsbl %cl, %edi ; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl %eax, %edx ; FALLBACK20-NEXT: notb %dl ; FALLBACK20-NEXT: movl 40(%esp,%edi), %ebp ; FALLBACK20-NEXT: movl %ebp, %esi ; FALLBACK20-NEXT: shrl %esi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %esi ; FALLBACK20-NEXT: orl %ebx, %esi ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %ebp ; FALLBACK20-NEXT: movl 32(%esp,%edi), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx ; FALLBACK20-NEXT: movl %ebx, %edi ; FALLBACK20-NEXT: shrl %edi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: orl %ebp, %edi ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK20-NEXT: shrl %ebp ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: orl %ebx, %ebp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: shll %cl, %eax ; FALLBACK20-NEXT: movl %eax, (%edx) ; FALLBACK20-NEXT: movl %ebp, 4(%edx) ; FALLBACK20-NEXT: movl %edi, 8(%edx) ; FALLBACK20-NEXT: movl %esi, 12(%edx) ; FALLBACK20-NEXT: addl $60, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx ; FALLBACK20-NEXT: popl %ebp ; FALLBACK20-NEXT: retl ; ; FALLBACK21-LABEL: shl_16bytes: ; FALLBACK21: # %bb.0: ; FALLBACK21-NEXT: pushl %ebp ; FALLBACK21-NEXT: pushl %ebx ; FALLBACK21-NEXT: pushl %edi ; FALLBACK21-NEXT: pushl %esi ; FALLBACK21-NEXT: subl $44, %esp ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK21-NEXT: movups (%edx), %xmm0 ; FALLBACK21-NEXT: movzbl (%ecx), %edx ; FALLBACK21-NEXT: movl %edx, %ecx ; FALLBACK21-NEXT: shlb $3, %cl ; FALLBACK21-NEXT: xorps %xmm1, %xmm1 ; FALLBACK21-NEXT: movaps %xmm1, (%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: andb $12, %dl ; FALLBACK21-NEXT: negb %dl ; FALLBACK21-NEXT: movsbl %dl, %edi ; FALLBACK21-NEXT: movl 24(%esp,%edi), %esi ; FALLBACK21-NEXT: movl 28(%esp,%edi), %edx ; FALLBACK21-NEXT: shldl %cl, %esi, %edx ; FALLBACK21-NEXT: movl 16(%esp,%edi), %ebx ; FALLBACK21-NEXT: movl 20(%esp,%edi), %edi ; FALLBACK21-NEXT: shldl %cl, %edi, %esi ; FALLBACK21-NEXT: movl 
%ebx, %ebp ; FALLBACK21-NEXT: shll %cl, %ebp ; FALLBACK21-NEXT: shldl %cl, %ebx, %edi ; FALLBACK21-NEXT: movl %edi, 4(%eax) ; FALLBACK21-NEXT: movl %esi, 8(%eax) ; FALLBACK21-NEXT: movl %edx, 12(%eax) ; FALLBACK21-NEXT: movl %ebp, (%eax) ; FALLBACK21-NEXT: addl $44, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi ; FALLBACK21-NEXT: popl %ebx ; FALLBACK21-NEXT: popl %ebp ; FALLBACK21-NEXT: retl ; ; FALLBACK22-LABEL: shl_16bytes: ; FALLBACK22: # %bb.0: ; FALLBACK22-NEXT: pushl %ebp ; FALLBACK22-NEXT: pushl %ebx ; FALLBACK22-NEXT: pushl %edi ; FALLBACK22-NEXT: pushl %esi ; FALLBACK22-NEXT: subl $44, %esp ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movzbl (%eax), %ecx ; FALLBACK22-NEXT: movl %ecx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: xorps %xmm1, %xmm1 ; FALLBACK22-NEXT: movaps %xmm1, (%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: andb $12, %cl ; FALLBACK22-NEXT: negb %cl ; FALLBACK22-NEXT: movsbl %cl, %ecx ; FALLBACK22-NEXT: shlxl %eax, 28(%esp,%ecx), %esi ; FALLBACK22-NEXT: movl 24(%esp,%ecx), %edx ; FALLBACK22-NEXT: shlxl %eax, %edx, %edi ; FALLBACK22-NEXT: movl %eax, %ebx ; FALLBACK22-NEXT: notb %bl ; FALLBACK22-NEXT: shrl %edx ; FALLBACK22-NEXT: shrxl %ebx, %edx, %edx ; FALLBACK22-NEXT: orl %esi, %edx ; FALLBACK22-NEXT: movl 20(%esp,%ecx), %esi ; FALLBACK22-NEXT: movl %esi, %ebp ; FALLBACK22-NEXT: shrl %ebp ; FALLBACK22-NEXT: shrxl %ebx, %ebp, %ebp ; FALLBACK22-NEXT: orl %edi, %ebp ; FALLBACK22-NEXT: shlxl %eax, %esi, %esi ; FALLBACK22-NEXT: movl 16(%esp,%ecx), %ecx ; FALLBACK22-NEXT: shlxl %eax, %ecx, %eax ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: orl %esi, %ecx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK22-NEXT: movl %eax, (%esi) ; FALLBACK22-NEXT: movl %ecx, 4(%esi) ; FALLBACK22-NEXT: movl %ebp, 8(%esi) ; FALLBACK22-NEXT: movl 
%edx, 12(%esi) ; FALLBACK22-NEXT: addl $44, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx ; FALLBACK22-NEXT: popl %ebp ; FALLBACK22-NEXT: retl ; ; FALLBACK23-LABEL: shl_16bytes: ; FALLBACK23: # %bb.0: ; FALLBACK23-NEXT: pushl %ebp ; FALLBACK23-NEXT: pushl %ebx ; FALLBACK23-NEXT: pushl %edi ; FALLBACK23-NEXT: pushl %esi ; FALLBACK23-NEXT: subl $44, %esp ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK23-NEXT: movups (%edx), %xmm0 ; FALLBACK23-NEXT: movzbl (%ecx), %edx ; FALLBACK23-NEXT: movl %edx, %ecx ; FALLBACK23-NEXT: shlb $3, %cl ; FALLBACK23-NEXT: xorps %xmm1, %xmm1 ; FALLBACK23-NEXT: movaps %xmm1, (%esp) ; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: andb $12, %dl ; FALLBACK23-NEXT: negb %dl ; FALLBACK23-NEXT: movsbl %dl, %edi ; FALLBACK23-NEXT: movl 24(%esp,%edi), %esi ; FALLBACK23-NEXT: movl 28(%esp,%edi), %edx ; FALLBACK23-NEXT: shldl %cl, %esi, %edx ; FALLBACK23-NEXT: movl 16(%esp,%edi), %ebx ; FALLBACK23-NEXT: movl 20(%esp,%edi), %edi ; FALLBACK23-NEXT: shldl %cl, %edi, %esi ; FALLBACK23-NEXT: shlxl %ecx, %ebx, %ebp ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK23-NEXT: shldl %cl, %ebx, %edi ; FALLBACK23-NEXT: movl %edi, 4(%eax) ; FALLBACK23-NEXT: movl %esi, 8(%eax) ; FALLBACK23-NEXT: movl %edx, 12(%eax) ; FALLBACK23-NEXT: movl %ebp, (%eax) ; FALLBACK23-NEXT: addl $44, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi ; FALLBACK23-NEXT: popl %ebx ; FALLBACK23-NEXT: popl %ebp ; FALLBACK23-NEXT: retl ; ; FALLBACK24-LABEL: shl_16bytes: ; FALLBACK24: # %bb.0: ; FALLBACK24-NEXT: pushl %ebp ; FALLBACK24-NEXT: pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi ; FALLBACK24-NEXT: subl $60, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %xmm0 ; 
FALLBACK24-NEXT: movzbl (%eax), %ecx ; FALLBACK24-NEXT: movl %ecx, %eax ; FALLBACK24-NEXT: shlb $3, %al ; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: andb $12, %cl ; FALLBACK24-NEXT: negb %cl ; FALLBACK24-NEXT: movsbl %cl, %edi ; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: movl %eax, %edx ; FALLBACK24-NEXT: notb %dl ; FALLBACK24-NEXT: movl 40(%esp,%edi), %ebp ; FALLBACK24-NEXT: movl %ebp, %esi ; FALLBACK24-NEXT: shrl %esi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %esi ; FALLBACK24-NEXT: orl %ebx, %esi ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %ebp ; FALLBACK24-NEXT: movl 32(%esp,%edi), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx ; FALLBACK24-NEXT: movl %ebx, %edi ; FALLBACK24-NEXT: shrl %edi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: orl %ebp, %edi ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK24-NEXT: shrl %ebp ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: orl %ebx, %ebp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: shll %cl, %eax ; FALLBACK24-NEXT: movl %eax, (%edx) ; FALLBACK24-NEXT: movl %ebp, 4(%edx) ; FALLBACK24-NEXT: movl %edi, 8(%edx) ; FALLBACK24-NEXT: movl %esi, 12(%edx) ; FALLBACK24-NEXT: addl $60, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx ; FALLBACK24-NEXT: popl %ebp ; FALLBACK24-NEXT: retl ; ; FALLBACK25-LABEL: shl_16bytes: ; FALLBACK25: # %bb.0: ; FALLBACK25-NEXT: pushl 
%ebp ; FALLBACK25-NEXT: pushl %ebx ; FALLBACK25-NEXT: pushl %edi ; FALLBACK25-NEXT: pushl %esi ; FALLBACK25-NEXT: subl $44, %esp ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK25-NEXT: vmovups (%edx), %xmm0 ; FALLBACK25-NEXT: movzbl (%ecx), %edx ; FALLBACK25-NEXT: movl %edx, %ecx ; FALLBACK25-NEXT: shlb $3, %cl ; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK25-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: andb $12, %dl ; FALLBACK25-NEXT: negb %dl ; FALLBACK25-NEXT: movsbl %dl, %edi ; FALLBACK25-NEXT: movl 24(%esp,%edi), %esi ; FALLBACK25-NEXT: movl 28(%esp,%edi), %edx ; FALLBACK25-NEXT: shldl %cl, %esi, %edx ; FALLBACK25-NEXT: movl 16(%esp,%edi), %ebx ; FALLBACK25-NEXT: movl 20(%esp,%edi), %edi ; FALLBACK25-NEXT: shldl %cl, %edi, %esi ; FALLBACK25-NEXT: movl %ebx, %ebp ; FALLBACK25-NEXT: shll %cl, %ebp ; FALLBACK25-NEXT: shldl %cl, %ebx, %edi ; FALLBACK25-NEXT: movl %edi, 4(%eax) ; FALLBACK25-NEXT: movl %esi, 8(%eax) ; FALLBACK25-NEXT: movl %edx, 12(%eax) ; FALLBACK25-NEXT: movl %ebp, (%eax) ; FALLBACK25-NEXT: addl $44, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi ; FALLBACK25-NEXT: popl %ebx ; FALLBACK25-NEXT: popl %ebp ; FALLBACK25-NEXT: retl ; ; FALLBACK26-LABEL: shl_16bytes: ; FALLBACK26: # %bb.0: ; FALLBACK26-NEXT: pushl %ebp ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi ; FALLBACK26-NEXT: subl $44, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK26-NEXT: movzbl (%eax), %ecx ; FALLBACK26-NEXT: movl %ecx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: andb $12, %cl ; FALLBACK26-NEXT: negb %cl ; 
FALLBACK26-NEXT: movsbl %cl, %ecx ; FALLBACK26-NEXT: shlxl %eax, 28(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 24(%esp,%ecx), %edx ; FALLBACK26-NEXT: shlxl %eax, %edx, %edi ; FALLBACK26-NEXT: movl %eax, %ebx ; FALLBACK26-NEXT: notb %bl ; FALLBACK26-NEXT: shrl %edx ; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx ; FALLBACK26-NEXT: orl %esi, %edx ; FALLBACK26-NEXT: movl 20(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl %esi, %ebp ; FALLBACK26-NEXT: shrl %ebp ; FALLBACK26-NEXT: shrxl %ebx, %ebp, %ebp ; FALLBACK26-NEXT: orl %edi, %ebp ; FALLBACK26-NEXT: shlxl %eax, %esi, %esi ; FALLBACK26-NEXT: movl 16(%esp,%ecx), %ecx ; FALLBACK26-NEXT: shlxl %eax, %ecx, %eax ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: orl %esi, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK26-NEXT: movl %eax, (%esi) ; FALLBACK26-NEXT: movl %ecx, 4(%esi) ; FALLBACK26-NEXT: movl %ebp, 8(%esi) ; FALLBACK26-NEXT: movl %edx, 12(%esi) ; FALLBACK26-NEXT: addl $44, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx ; FALLBACK26-NEXT: popl %ebp ; FALLBACK26-NEXT: retl ; ; FALLBACK27-LABEL: shl_16bytes: ; FALLBACK27: # %bb.0: ; FALLBACK27-NEXT: pushl %ebp ; FALLBACK27-NEXT: pushl %ebx ; FALLBACK27-NEXT: pushl %edi ; FALLBACK27-NEXT: pushl %esi ; FALLBACK27-NEXT: subl $44, %esp ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK27-NEXT: vmovups (%edx), %xmm0 ; FALLBACK27-NEXT: movzbl (%ecx), %edx ; FALLBACK27-NEXT: movl %edx, %ecx ; FALLBACK27-NEXT: shlb $3, %cl ; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK27-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: andb $12, %dl ; FALLBACK27-NEXT: negb %dl ; FALLBACK27-NEXT: movsbl %dl, %edi ; FALLBACK27-NEXT: movl 24(%esp,%edi), %esi ; FALLBACK27-NEXT: movl 28(%esp,%edi), %edx ; FALLBACK27-NEXT: shldl %cl, %esi, %edx ; 
FALLBACK27-NEXT: movl 16(%esp,%edi), %ebx ; FALLBACK27-NEXT: movl 20(%esp,%edi), %edi ; FALLBACK27-NEXT: shldl %cl, %edi, %esi ; FALLBACK27-NEXT: shlxl %ecx, %ebx, %ebp ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK27-NEXT: shldl %cl, %ebx, %edi ; FALLBACK27-NEXT: movl %edi, 4(%eax) ; FALLBACK27-NEXT: movl %esi, 8(%eax) ; FALLBACK27-NEXT: movl %edx, 12(%eax) ; FALLBACK27-NEXT: movl %ebp, (%eax) ; FALLBACK27-NEXT: addl $44, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi ; FALLBACK27-NEXT: popl %ebx ; FALLBACK27-NEXT: popl %ebp ; FALLBACK27-NEXT: retl ; ; FALLBACK28-LABEL: shl_16bytes: ; FALLBACK28: # %bb.0: ; FALLBACK28-NEXT: pushl %ebp ; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi ; FALLBACK28-NEXT: subl $60, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK28-NEXT: movzbl (%eax), %ecx ; FALLBACK28-NEXT: movl %ecx, %eax ; FALLBACK28-NEXT: shlb $3, %al ; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: andb $12, %cl ; FALLBACK28-NEXT: negb %cl ; FALLBACK28-NEXT: movsbl %cl, %edi ; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl %eax, %edx ; FALLBACK28-NEXT: notb %dl ; FALLBACK28-NEXT: movl 40(%esp,%edi), %ebp ; FALLBACK28-NEXT: movl %ebp, %esi ; FALLBACK28-NEXT: shrl %esi ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %esi ; FALLBACK28-NEXT: orl %ebx, %esi ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %ebp ; FALLBACK28-NEXT: movl 32(%esp,%edi), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx ; FALLBACK28-NEXT: movl %ebx, %edi ; FALLBACK28-NEXT: shrl %edi ; FALLBACK28-NEXT: movl %edx, 
%ecx ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: orl %ebp, %edi ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK28-NEXT: shrl %ebp ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: orl %ebx, %ebp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: shll %cl, %eax ; FALLBACK28-NEXT: movl %eax, (%edx) ; FALLBACK28-NEXT: movl %ebp, 4(%edx) ; FALLBACK28-NEXT: movl %edi, 8(%edx) ; FALLBACK28-NEXT: movl %esi, 12(%edx) ; FALLBACK28-NEXT: addl $60, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx ; FALLBACK28-NEXT: popl %ebp ; FALLBACK28-NEXT: retl ; ; FALLBACK29-LABEL: shl_16bytes: ; FALLBACK29: # %bb.0: ; FALLBACK29-NEXT: pushl %ebp ; FALLBACK29-NEXT: pushl %ebx ; FALLBACK29-NEXT: pushl %edi ; FALLBACK29-NEXT: pushl %esi ; FALLBACK29-NEXT: subl $44, %esp ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK29-NEXT: vmovups (%edx), %xmm0 ; FALLBACK29-NEXT: movzbl (%ecx), %edx ; FALLBACK29-NEXT: movl %edx, %ecx ; FALLBACK29-NEXT: shlb $3, %cl ; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK29-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: andb $12, %dl ; FALLBACK29-NEXT: negb %dl ; FALLBACK29-NEXT: movsbl %dl, %edi ; FALLBACK29-NEXT: movl 24(%esp,%edi), %esi ; FALLBACK29-NEXT: movl 28(%esp,%edi), %edx ; FALLBACK29-NEXT: shldl %cl, %esi, %edx ; FALLBACK29-NEXT: movl 16(%esp,%edi), %ebx ; FALLBACK29-NEXT: movl 20(%esp,%edi), %edi ; FALLBACK29-NEXT: shldl %cl, %edi, %esi ; FALLBACK29-NEXT: movl %ebx, %ebp ; FALLBACK29-NEXT: shll %cl, %ebp ; FALLBACK29-NEXT: shldl %cl, %ebx, %edi ; FALLBACK29-NEXT: movl %edi, 4(%eax) 
; FALLBACK29-NEXT: movl %esi, 8(%eax) ; FALLBACK29-NEXT: movl %edx, 12(%eax) ; FALLBACK29-NEXT: movl %ebp, (%eax) ; FALLBACK29-NEXT: addl $44, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi ; FALLBACK29-NEXT: popl %ebx ; FALLBACK29-NEXT: popl %ebp ; FALLBACK29-NEXT: retl ; ; FALLBACK30-LABEL: shl_16bytes: ; FALLBACK30: # %bb.0: ; FALLBACK30-NEXT: pushl %ebp ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi ; FALLBACK30-NEXT: subl $44, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK30-NEXT: movzbl (%eax), %ecx ; FALLBACK30-NEXT: movl %ecx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: andb $12, %cl ; FALLBACK30-NEXT: negb %cl ; FALLBACK30-NEXT: movsbl %cl, %ecx ; FALLBACK30-NEXT: shlxl %eax, 28(%esp,%ecx), %esi ; FALLBACK30-NEXT: movl 24(%esp,%ecx), %edx ; FALLBACK30-NEXT: shlxl %eax, %edx, %edi ; FALLBACK30-NEXT: movl %eax, %ebx ; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: shrl %edx ; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx ; FALLBACK30-NEXT: orl %esi, %edx ; FALLBACK30-NEXT: movl 20(%esp,%ecx), %esi ; FALLBACK30-NEXT: movl %esi, %ebp ; FALLBACK30-NEXT: shrl %ebp ; FALLBACK30-NEXT: shrxl %ebx, %ebp, %ebp ; FALLBACK30-NEXT: orl %edi, %ebp ; FALLBACK30-NEXT: shlxl %eax, %esi, %esi ; FALLBACK30-NEXT: movl 16(%esp,%ecx), %ecx ; FALLBACK30-NEXT: shlxl %eax, %ecx, %eax ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK30-NEXT: orl %esi, %ecx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK30-NEXT: movl %eax, (%esi) ; FALLBACK30-NEXT: movl %ecx, 4(%esi) ; FALLBACK30-NEXT: movl %ebp, 8(%esi) ; FALLBACK30-NEXT: movl %edx, 12(%esi) ; FALLBACK30-NEXT: addl $44, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; 
FALLBACK30-NEXT: popl %ebx ; FALLBACK30-NEXT: popl %ebp ; FALLBACK30-NEXT: retl ; ; FALLBACK31-LABEL: shl_16bytes: ; FALLBACK31: # %bb.0: ; FALLBACK31-NEXT: pushl %ebp ; FALLBACK31-NEXT: pushl %ebx ; FALLBACK31-NEXT: pushl %edi ; FALLBACK31-NEXT: pushl %esi ; FALLBACK31-NEXT: subl $44, %esp ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK31-NEXT: vmovups (%edx), %xmm0 ; FALLBACK31-NEXT: movzbl (%ecx), %edx ; FALLBACK31-NEXT: movl %edx, %ecx ; FALLBACK31-NEXT: shlb $3, %cl ; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK31-NEXT: vmovaps %xmm1, (%esp) ; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: andb $12, %dl ; FALLBACK31-NEXT: negb %dl ; FALLBACK31-NEXT: movsbl %dl, %edi ; FALLBACK31-NEXT: movl 24(%esp,%edi), %esi ; FALLBACK31-NEXT: movl 28(%esp,%edi), %edx ; FALLBACK31-NEXT: shldl %cl, %esi, %edx ; FALLBACK31-NEXT: movl 16(%esp,%edi), %ebx ; FALLBACK31-NEXT: movl 20(%esp,%edi), %edi ; FALLBACK31-NEXT: shldl %cl, %edi, %esi ; FALLBACK31-NEXT: shlxl %ecx, %ebx, %ebp ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK31-NEXT: shldl %cl, %ebx, %edi ; FALLBACK31-NEXT: movl %edi, 4(%eax) ; FALLBACK31-NEXT: movl %esi, 8(%eax) ; FALLBACK31-NEXT: movl %edx, 12(%eax) ; FALLBACK31-NEXT: movl %ebp, (%eax) ; FALLBACK31-NEXT: addl $44, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi ; FALLBACK31-NEXT: popl %ebx ; FALLBACK31-NEXT: popl %ebp ; FALLBACK31-NEXT: retl %src = load i128, ptr %src.ptr, align 1 %byteOff = load i128, ptr %byteOff.ptr, align 1 %bitOff = shl i128 %byteOff, 3 %res = shl i128 %src, %bitOff store i128 %res, ptr %dst, align 1 ret void } define void @shl_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; X64-NO-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff: ; X64-NO-SHLD-NO-BMI2: # %bb.0: ; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; 
X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax ; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: movq %r8, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: shrq %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl ; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: orq %rdi, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: xorl %ecx, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al ; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rcx ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rcx, (%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, 8(%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-NO-BMI2-LABEL: shl_16bytes_dwordOff: ; X64-HAVE-SHLD-NO-BMI2: # %bb.0: ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: shldq %cl, %rax, %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: xorl %eax, %eax ; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: retq ; ; X64-NO-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff: ; X64-NO-SHLD-HAVE-BMI2: # %bb.0: ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, 8(%rdi), %rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi ; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %r8 ; X64-NO-SHLD-HAVE-BMI2-NEXT: shrq %rax ; 
X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rdi, %rax, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi ; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %r8, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %r8, %rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-HAVE-BMI2-LABEL: shl_16bytes_dwordOff: ; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0: ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shldq %cl, %rax, %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlxq %rcx, %rax, %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: xorl %esi, %esi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rax, %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rax, %rsi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rsi, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; ; X86-SSE2-LABEL: shl_16bytes_dwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $32, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE2-NEXT: movl (%edx), %esi ; X86-SSE2-NEXT: movl 4(%edx), %edi ; X86-SSE2-NEXT: movl 8(%edx), %ebx ; X86-SSE2-NEXT: movl 12(%edx), %edx ; X86-SSE2-NEXT: movzbl (%ecx), %ecx ; X86-SSE2-NEXT: xorps %xmm0, %xmm0 ; X86-SSE2-NEXT: movaps %xmm0, (%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: shlb $2, %cl ; X86-SSE2-NEXT: andb $12, %cl ; X86-SSE2-NEXT: negb %cl ; 
X86-SSE2-NEXT: movsbl %cl, %ecx ; X86-SSE2-NEXT: movl 16(%esp,%ecx), %edx ; X86-SSE2-NEXT: movl 20(%esp,%ecx), %esi ; X86-SSE2-NEXT: movl 28(%esp,%ecx), %edi ; X86-SSE2-NEXT: movl 24(%esp,%ecx), %ecx ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl %edi, 12(%eax) ; X86-SSE2-NEXT: movl %edx, (%eax) ; X86-SSE2-NEXT: movl %esi, 4(%eax) ; X86-SSE2-NEXT: addl $32, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: shl_16bytes_dwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: subl $44, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movzbl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm1, %xmm1 ; X86-SSE42-NEXT: movaps %xmm1, (%esp) ; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: shlb $2, %cl ; X86-SSE42-NEXT: andb $12, %cl ; X86-SSE42-NEXT: negb %cl ; X86-SSE42-NEXT: movsbl %cl, %ecx ; X86-SSE42-NEXT: movups 16(%esp,%ecx), %xmm0 ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $44, %esp ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: shl_16bytes_dwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: subl $44, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovups (%edx), %xmm0 ; X86-AVX-NEXT: movzbl (%ecx), %ecx ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX-NEXT: vmovaps %xmm1, (%esp) ; X86-AVX-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: shlb $2, %cl ; X86-AVX-NEXT: andb $12, %cl ; X86-AVX-NEXT: negb %cl ; X86-AVX-NEXT: movsbl %cl, %ecx ; X86-AVX-NEXT: vmovups 16(%esp,%ecx), %xmm0 ; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: addl $44, %esp ; X86-AVX-NEXT: retl %src = load i128, ptr %src.ptr, align 1 %dwordOff = load i128, ptr %dwordOff.ptr, align 1 %bitOff = shl i128 %dwordOff, 5 %res = shl i128 
%src, %bitOff store i128 %res, ptr %dst, align 1 ret void }
; ashr_16bytes - arithmetic right shift of a 16-byte (i128) value by a
; byte-granular amount. The IR body at the end of this function loads i128
; %src from %src.ptr and an i128 byte offset from %byteOff.ptr (both align 1),
; converts the offset to bits with `shl ..., 3` (i.e. multiplies by 8), applies
; `ashr`, and stores the i128 result to %dst (align 1).
; The per-configuration check lines that follow are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file), so
; regenerate them with that script instead of editing them by hand.
define void @ashr_16bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes: ; X64-NO-SHLD-NO-BMI2: # %bb.0: ; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax ; X64-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl ; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: sarq %cl, %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: sarq $63, %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al ; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes: ; X64-HAVE-SHLD-NO-BMI2: # %bb.0: ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq %cl, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq $63, %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: retq ; ; X64-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes: ; X64-NO-SHLD-HAVE-BMI2: # %bb.0: ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx
; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi ; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil ; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8 ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, %rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rax, %rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: sarq $63, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes: ; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0: ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rdi, %rsi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarq $63, %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; ; X86-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes: ; X86-NO-SHLD-NO-BMI2: # %bb.0: ; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebp ; X86-NO-SHLD-NO-BMI2-NEXT: pushl %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: pushl %edi ; X86-NO-SHLD-NO-BMI2-NEXT: pushl %esi ; X86-NO-SHLD-NO-BMI2-NEXT: subl $60, %esp ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: movl (%ecx), %edx ; X86-NO-SHLD-NO-BMI2-NEXT: movl 4(%ecx), %esi ; X86-NO-SHLD-NO-BMI2-NEXT: movl 8(%ecx), %edi ; X86-NO-SHLD-NO-BMI2-NEXT: movl
12(%ecx), %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: movb (%eax), %ah ; X86-NO-SHLD-NO-BMI2-NEXT: movb %ah, %al ; X86-NO-SHLD-NO-BMI2-NEXT: shlb $3, %al ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-NO-BMI2-NEXT: sarl $31, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-NO-BMI2-NEXT: andb $12, %ah ; X86-NO-SHLD-NO-BMI2-NEXT: movzbl %ah, %ebp ; X86-NO-SHLD-NO-BMI2-NEXT: movl 20(%esp,%ebp), %esi ; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %edx ; X86-NO-SHLD-NO-BMI2-NEXT: notb %dl ; X86-NO-SHLD-NO-BMI2-NEXT: movl 24(%esp,%ebp), %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ecx,%ecx), %edi ; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %edi ; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %edi ; X86-NO-SHLD-NO-BMI2-NEXT: movl 16(%esp,%ebp), %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: addl %esi, %esi ; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %esi ; X86-NO-SHLD-NO-BMI2-NEXT: orl %ebx, %esi ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NO-SHLD-NO-BMI2-NEXT: movl 28(%esp,%ebp), %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: leal (%ebx,%ebx), %ebp ; X86-NO-SHLD-NO-BMI2-NEXT: movl %edx, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: shll %cl, %ebp ; X86-NO-SHLD-NO-BMI2-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp #
4-byte Folded Reload ; X86-NO-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X86-NO-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebx, 12(%edx) ; X86-NO-SHLD-NO-BMI2-NEXT: movl %ebp, 8(%edx) ; X86-NO-SHLD-NO-BMI2-NEXT: movl %esi, (%edx) ; X86-NO-SHLD-NO-BMI2-NEXT: movl %edi, 4(%edx) ; X86-NO-SHLD-NO-BMI2-NEXT: addl $60, %esp ; X86-NO-SHLD-NO-BMI2-NEXT: popl %esi ; X86-NO-SHLD-NO-BMI2-NEXT: popl %edi ; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebx ; X86-NO-SHLD-NO-BMI2-NEXT: popl %ebp ; X86-NO-SHLD-NO-BMI2-NEXT: retl ; ; X86-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes: ; X86-HAVE-SHLD-NO-BMI2: # %bb.0: ; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %ebp ; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %ebx ; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %edi ; X86-HAVE-SHLD-NO-BMI2-NEXT: pushl %esi ; X86-HAVE-SHLD-NO-BMI2-NEXT: subl $44, %esp ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%edx), %esi ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%edx), %edi ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 8(%edx), %ebx ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%edx), %edx ; X86-HAVE-SHLD-NO-BMI2-NEXT: movb (%ecx), %ch ; X86-HAVE-SHLD-NO-BMI2-NEXT: movb %ch, %cl ; X86-HAVE-SHLD-NO-BMI2-NEXT: shlb $3, %cl ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, (%esp) ; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl $31, %edx ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-NO-BMI2-NEXT: andb $12, %ch ; X86-HAVE-SHLD-NO-BMI2-NEXT: movzbl %ch, %ebx ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl
8(%esp,%ebx), %esi ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl (%esp,%ebx), %edx ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 4(%esp,%ebx), %ebp ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebp, %edi ; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %esi, %edi ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl 12(%esp,%ebx), %ebx ; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebx, %esi ; X86-HAVE-SHLD-NO-BMI2-NEXT: shrdl %cl, %ebp, %edx ; X86-HAVE-SHLD-NO-BMI2-NEXT: sarl %cl, %ebx ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %esi, 8(%eax) ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %ebx, 12(%eax) ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edx, (%eax) ; X86-HAVE-SHLD-NO-BMI2-NEXT: movl %edi, 4(%eax) ; X86-HAVE-SHLD-NO-BMI2-NEXT: addl $44, %esp ; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %esi ; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %edi ; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %ebx ; X86-HAVE-SHLD-NO-BMI2-NEXT: popl %ebp ; X86-HAVE-SHLD-NO-BMI2-NEXT: retl ; ; X86-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes: ; X86-NO-SHLD-HAVE-BMI2: # %bb.0: ; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebp ; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %ebx ; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %edi ; X86-NO-SHLD-HAVE-BMI2-NEXT: pushl %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: subl $44, %esp ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl (%ecx), %edx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%ecx), %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%ecx), %edi ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%ecx), %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%eax), %ebx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ebx, %eax ; X86-NO-SHLD-HAVE-BMI2-NEXT: shlb $3, %al ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, (%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: sarl $31, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ;
X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-SHLD-HAVE-BMI2-NEXT: andb $12, %bl ; X86-NO-SHLD-HAVE-BMI2-NEXT: movzbl %bl, %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%esi), %edi ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%esi), %ebx ; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %edi, %ebp ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, %edx ; X86-NO-SHLD-HAVE-BMI2-NEXT: notb %dl ; X86-NO-SHLD-HAVE-BMI2-NEXT: leal (%ebx,%ebx), %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %ecx, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %ecx ; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, (%esp,%esi), %ebp ; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %edi, %edi ; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %edi, %edi ; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebp, %edi ; X86-NO-SHLD-HAVE-BMI2-NEXT: shrxl %eax, %ebx, %ebx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%esi), %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: sarxl %eax, %esi, %eax ; X86-NO-SHLD-HAVE-BMI2-NEXT: addl %esi, %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: shlxl %edx, %esi, %edx ; X86-NO-SHLD-HAVE-BMI2-NEXT: orl %ebx, %edx ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%esi) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edx, 8(%esi) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %edi, (%esi) ; X86-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, 4(%esi) ; X86-NO-SHLD-HAVE-BMI2-NEXT: addl $44, %esp ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %edi ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebx ; X86-NO-SHLD-HAVE-BMI2-NEXT: popl %ebp ; X86-NO-SHLD-HAVE-BMI2-NEXT: retl ; ; X86-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes: ; X86-HAVE-SHLD-HAVE-BMI2: # %bb.0: ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %ebp ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %ebx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: pushl %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: subl $44, %esp ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ebp ;
X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%edx), %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%edx), %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%edx), %ebx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%edx), %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%ecx), %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, %ecx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $3, %cl ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, (%esp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarl $31, %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: andb $12, %al ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl %al, %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 8(%esp,%eax), %ebx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl (%esp,%eax), %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 4(%esp,%eax), %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %esi, %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %ebx, %edi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl 12(%esp,%eax), %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %eax, %ebx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %ebx, 8(%ebp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: sarxl %ecx, %eax, %eax ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %eax, 12(%ebp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: shrdl %cl, %esi, %edx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edx, (%ebp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: movl %edi, 4(%ebp) ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: addl $44, %esp ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %esi ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %edi ;
X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %ebx ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: popl %ebp ; X86-HAVE-SHLD-HAVE-BMI2-NEXT: retl %src = load i128, ptr %src.ptr, align 1 %byteOff = load i128, ptr %byteOff.ptr, align 1 %bitOff = shl i128 %byteOff, 3 %res = ashr i128 %src, %bitOff store i128 %res, ptr %dst, align 1 ret void }
; ashr_16bytes_dwordOff - arithmetic right shift of a 16-byte (i128) value by
; a dword-granular amount. The IR body at the end of this function loads i128
; %src from %src.ptr and an i128 offset from %dwordOff.ptr (both align 1),
; converts the offset to bits with `shl ..., 5` (i.e. multiplies by 32), applies
; `ashr`, and stores the i128 result to %dst (align 1).
; In the 32-bit check lines for this function the offset is masked with
; `andl $3` and used as a scaled index into a stack copy of the value whose
; upper half is filled from `sarl $31`, so no per-bit shift of the data appears.
; The per-configuration check lines that follow are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of the file), so
; regenerate them with that script instead of editing them by hand.
define void @ashr_16bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; X64-NO-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff: ; X64-NO-SHLD-NO-BMI2: # %bb.0: ; X64-NO-SHLD-NO-BMI2-NEXT: movq (%rdi), %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %eax ; X64-NO-SHLD-NO-BMI2-NEXT: shlb $5, %al ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: shrq %cl, %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: leaq (%rdi,%rdi), %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: notb %cl ; X64-NO-SHLD-NO-BMI2-NEXT: shlq %cl, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: orq %r8, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: movl %eax, %ecx ; X64-NO-SHLD-NO-BMI2-NEXT: sarq %cl, %r8 ; X64-NO-SHLD-NO-BMI2-NEXT: sarq $63, %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: testb $64, %al ; X64-NO-SHLD-NO-BMI2-NEXT: cmovneq %r8, %rsi ; X64-NO-SHLD-NO-BMI2-NEXT: cmoveq %r8, %rdi ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: movq %rsi, (%rdx) ; X64-NO-SHLD-NO-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-NO-BMI2-LABEL: ashr_16bytes_dwordOff: ; X64-HAVE-SHLD-NO-BMI2: # %bb.0: ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-NO-BMI2-NEXT: shlb $5, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rdi, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq %cl, %rsi ; X64-HAVE-SHLD-NO-BMI2-NEXT: shrdq %cl, %rdi, %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: sarq $63, %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmovneq %rsi, %rax ; X64-HAVE-SHLD-NO-BMI2-NEXT: cmoveq %rsi, %rdi ; X64-HAVE-SHLD-NO-BMI2-NEXT:
movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-NO-BMI2-NEXT: retq ; ; X64-NO-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff: ; X64-NO-SHLD-HAVE-BMI2: # %bb.0: ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: shrxq %rcx, (%rdi), %rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: movl %ecx, %edi ; X64-NO-SHLD-HAVE-BMI2-NEXT: notb %dil ; X64-NO-SHLD-HAVE-BMI2-NEXT: leaq (%rax,%rax), %r8 ; X64-NO-SHLD-HAVE-BMI2-NEXT: shlxq %rdi, %r8, %rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: orq %rsi, %rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rax, %rsi ; X64-NO-SHLD-HAVE-BMI2-NEXT: sarq $63, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rdi ; X64-NO-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rax ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rax, 8(%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: movq %rdi, (%rdx) ; X64-NO-SHLD-HAVE-BMI2-NEXT: retq ; ; X64-HAVE-SHLD-HAVE-BMI2-LABEL: ashr_16bytes_dwordOff: ; X64-HAVE-SHLD-HAVE-BMI2: # %bb.0: ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq (%rdi), %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq 8(%rdi), %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movzbl (%rsi), %ecx ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shlb $5, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: shrdq %cl, %rdi, %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarxq %rcx, %rdi, %rsi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: sarq $63, %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: testb $64, %cl ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmovneq %rsi, %rax ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: cmoveq %rsi, %rdi ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: movq %rax, (%rdx) ; X64-HAVE-SHLD-HAVE-BMI2-NEXT: retq ; ; X86-SSE2-LABEL: ashr_16bytes_dwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $32, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE2-NEXT: movl (%edx), %esi ; X86-SSE2-NEXT: movl 4(%edx), %edi ; X86-SSE2-NEXT: movl 8(%edx), %ebx ; X86-SSE2-NEXT: movl 12(%edx), %edx ; X86-SSE2-NEXT: movzbl (%ecx), %ecx ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, (%esp) ; X86-SSE2-NEXT: sarl $31, %edx ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andl $3, %ecx ; X86-SSE2-NEXT: movl (%esp,%ecx,4), %edx ; X86-SSE2-NEXT: movl 4(%esp,%ecx,4), %esi ; X86-SSE2-NEXT: movl 12(%esp,%ecx,4), %edi ; X86-SSE2-NEXT: movl 8(%esp,%ecx,4), %ecx ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl %edi, 12(%eax) ; X86-SSE2-NEXT: movl %edx, (%eax) ; X86-SSE2-NEXT: movl %esi, 4(%eax) ; X86-SSE2-NEXT: addl $32, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: ashr_16bytes_dwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: pushl %ebx ; X86-SSE42-NEXT: pushl %edi ; X86-SSE42-NEXT: pushl %esi ; X86-SSE42-NEXT: subl $32, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movl (%edx), %esi ; X86-SSE42-NEXT: movl 4(%edx), %edi ; X86-SSE42-NEXT: movl 8(%edx), %ebx ; X86-SSE42-NEXT: movl 12(%edx), %edx ; X86-SSE42-NEXT: movzbl (%ecx), %ecx ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %esi, (%esp) ; X86-SSE42-NEXT: sarl $31, %edx ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ;
X86-SSE42-NEXT: andl $3, %ecx ; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0 ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $32, %esp ; X86-SSE42-NEXT: popl %esi ; X86-SSE42-NEXT: popl %edi ; X86-SSE42-NEXT: popl %ebx ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: ashr_16bytes_dwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %ebx ; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: pushl %esi ; X86-AVX-NEXT: subl $32, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: movl (%edx), %esi ; X86-AVX-NEXT: movl 4(%edx), %edi ; X86-AVX-NEXT: movl 8(%edx), %ebx ; X86-AVX-NEXT: movl 12(%edx), %edx ; X86-AVX-NEXT: movzbl (%ecx), %ecx ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %esi, (%esp) ; X86-AVX-NEXT: sarl $31, %edx ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: andl $3, %ecx ; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0 ; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: addl $32, %esp ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: popl %ebx ; X86-AVX-NEXT: retl %src = load i128, ptr %src.ptr, align 1 %dwordOff = load i128, ptr %dwordOff.ptr, align 1 %bitOff = shl i128 %dwordOff, 5 %res = ashr i128 %src, %bitOff store i128 %res, ptr %dst, align 1 ret void } define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: lshr_32bytes: ; FALLBACK0: # %bb.0: ; FALLBACK0-NEXT: pushq %rbx ; FALLBACK0-NEXT: movq (%rdi), %rcx ; FALLBACK0-NEXT: movq 8(%rdi), %r8 ; FALLBACK0-NEXT: movq 16(%rdi), %r9 ; FALLBACK0-NEXT: movq 24(%rdi), %rdi ; FALLBACK0-NEXT: movzbl (%rsi), %esi ; FALLBACK0-NEXT: leal (,%rsi,8), %eax ; FALLBACK0-NEXT: xorps %xmm0, %xmm0 ; FALLBACK0-NEXT: 
movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: andb $24, %sil ; FALLBACK0-NEXT: movzbl %sil, %r9d ; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10 ; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi ; FALLBACK0-NEXT: movq %rdi, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx ; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 ; FALLBACK0-NEXT: orq %r11, %r8 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r10 ; FALLBACK0-NEXT: addq %rdi, %rdi ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %rdi ; FALLBACK0-NEXT: orq %r10, %rdi ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx ; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9 ; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: orq %rbx, %r10 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r9 ; FALLBACK0-NEXT: movq %r9, 24(%rdx) ; FALLBACK0-NEXT: movq %r10, 16(%rdx) ; FALLBACK0-NEXT: movq %rdi, (%rdx) ; FALLBACK0-NEXT: movq %r8, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: lshr_32bytes: ; FALLBACK1: # %bb.0: ; FALLBACK1-NEXT: movq (%rdi), %rax ; FALLBACK1-NEXT: movq 8(%rdi), %r8 ; FALLBACK1-NEXT: movq 16(%rdi), %r9 ; FALLBACK1-NEXT: movq 24(%rdi), %rdi ; FALLBACK1-NEXT: movzbl (%rsi), %esi ; FALLBACK1-NEXT: leal (,%rsi,8), %ecx ; FALLBACK1-NEXT: xorps %xmm0, %xmm0 ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; 
FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: andb $24, %sil ; FALLBACK1-NEXT: movzbl %sil, %eax ; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi ; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi ; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8 ; FALLBACK1-NEXT: movq %r8, %r9 ; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 ; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax ; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi ; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK1-NEXT: shrq %cl, %rax ; FALLBACK1-NEXT: movq %rsi, 16(%rdx) ; FALLBACK1-NEXT: movq %rax, 24(%rdx) ; FALLBACK1-NEXT: movq %rdi, (%rdx) ; FALLBACK1-NEXT: movq %r9, 8(%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: lshr_32bytes: ; FALLBACK2: # %bb.0: ; FALLBACK2-NEXT: movq (%rdi), %rcx ; FALLBACK2-NEXT: movq 8(%rdi), %r8 ; FALLBACK2-NEXT: movq 16(%rdi), %r9 ; FALLBACK2-NEXT: movq 24(%rdi), %rdi ; FALLBACK2-NEXT: movzbl (%rsi), %esi ; FALLBACK2-NEXT: leal (,%rsi,8), %eax ; FALLBACK2-NEXT: xorps %xmm0, %xmm0 ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: movzbl %sil, %ecx ; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi ; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi ; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 ; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 ; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx ; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK2-NEXT: orq %r8, %rdi ; FALLBACK2-NEXT: addq %rsi, %rsi ; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi ; 
FALLBACK2-NEXT: orq %r9, %rsi ; FALLBACK2-NEXT: addq %rcx, %rcx ; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax ; FALLBACK2-NEXT: orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) ; FALLBACK2-NEXT: movq %rsi, (%rdx) ; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes: ; FALLBACK3: # %bb.0: ; FALLBACK3-NEXT: movq (%rdi), %rax ; FALLBACK3-NEXT: movq 8(%rdi), %r8 ; FALLBACK3-NEXT: movq 16(%rdi), %r9 ; FALLBACK3-NEXT: movq 24(%rdi), %rdi ; FALLBACK3-NEXT: movzbl (%rsi), %esi ; FALLBACK3-NEXT: leal (,%rsi,8), %ecx ; FALLBACK3-NEXT: xorps %xmm0, %xmm0 ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: andb $24, %sil ; FALLBACK3-NEXT: movzbl %sil, %eax ; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi ; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi ; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8 ; FALLBACK3-NEXT: movq %r8, %r9 ; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 ; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax ; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi ; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi ; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax ; FALLBACK3-NEXT: movq %rsi, 16(%rdx) ; FALLBACK3-NEXT: movq %rax, 24(%rdx) ; FALLBACK3-NEXT: movq %rdi, (%rdx) ; FALLBACK3-NEXT: movq %r9, 8(%rdx) ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: lshr_32bytes: ; FALLBACK4: # %bb.0: ; FALLBACK4-NEXT: pushq %rbx ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK4-NEXT: movzbl (%rsi), %ecx ; FALLBACK4-NEXT: leal (,%rcx,8), %eax ; FALLBACK4-NEXT: xorps %xmm2, %xmm2 ; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) 
; FALLBACK4-NEXT: andb $24, %cl ; FALLBACK4-NEXT: movzbl %cl, %r9d ; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10 ; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rdi ; FALLBACK4-NEXT: orq %r10, %rdi ; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10 ; FALLBACK4-NEXT: movq %r10, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r11 ; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9 ; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rbx ; FALLBACK4-NEXT: orq %r11, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 ; FALLBACK4-NEXT: addq %r10, %r10 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: orq %r8, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r9 ; FALLBACK4-NEXT: movq %r9, 24(%rdx) ; FALLBACK4-NEXT: movq %r10, 8(%rdx) ; FALLBACK4-NEXT: movq %rbx, 16(%rdx) ; FALLBACK4-NEXT: movq %rdi, (%rdx) ; FALLBACK4-NEXT: popq %rbx ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: lshr_32bytes: ; FALLBACK5: # %bb.0: ; FALLBACK5-NEXT: movups (%rdi), %xmm0 ; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK5-NEXT: movzbl (%rsi), %eax ; FALLBACK5-NEXT: leal (,%rax,8), %ecx ; FALLBACK5-NEXT: xorps %xmm2, %xmm2 ; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: andb $24, %al ; FALLBACK5-NEXT: movzbl %al, %eax ; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK5-NEXT: movq %rdi, %r8 ; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK5-NEXT: movq 
%rax, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK5-NEXT: shrq %cl, %rsi ; FALLBACK5-NEXT: movq %r10, 8(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) ; FALLBACK5-NEXT: movq %r9, (%rdx) ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: lshr_32bytes: ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movzbl (%rsi), %ecx ; FALLBACK6-NEXT: leal (,%rcx,8), %eax ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: andb $24, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx ; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi ; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi ; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 ; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx ; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 ; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK6-NEXT: notb %al ; FALLBACK6-NEXT: addq %rdi, %rdi ; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK6-NEXT: orq %rsi, %rdi ; FALLBACK6-NEXT: addq %rcx, %rcx ; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK6-NEXT: orq %r9, %rcx ; FALLBACK6-NEXT: addq %r8, %r8 ; FALLBACK6-NEXT: shlxq %rax, %r8, %rax ; FALLBACK6-NEXT: orq %r10, %rax ; FALLBACK6-NEXT: movq %r11, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) ; FALLBACK6-NEXT: movq %rcx, 16(%rdx) ; FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes: ; FALLBACK7: # %bb.0: ; FALLBACK7-NEXT: movups (%rdi), %xmm0 ; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK7-NEXT: movzbl (%rsi), %eax ; FALLBACK7-NEXT: leal (,%rax,8), %ecx ; FALLBACK7-NEXT: xorps 
%xmm2, %xmm2 ; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: andb $24, %al ; FALLBACK7-NEXT: movzbl %al, %eax ; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK7-NEXT: movq %rdi, %r8 ; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK7-NEXT: movq %rax, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax ; FALLBACK7-NEXT: movq %r10, 8(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) ; FALLBACK7-NEXT: movq %rax, 24(%rdx) ; FALLBACK7-NEXT: movq %r9, (%rdx) ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: lshr_32bytes: ; FALLBACK8: # %bb.0: ; FALLBACK8-NEXT: pushq %rbx ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: movzbl (%rsi), %ecx ; FALLBACK8-NEXT: leal (,%rcx,8), %eax ; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: andb $24, %cl ; FALLBACK8-NEXT: movzbl %cl, %r9d ; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10 ; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rdi ; FALLBACK8-NEXT: orq %r10, %rdi ; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10 ; FALLBACK8-NEXT: movq %r10, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r11 ; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9 ; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rbx ; FALLBACK8-NEXT: orq %r11, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: 
shrq %cl, %r8 ; FALLBACK8-NEXT: addq %r10, %r10 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: orq %r8, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r9 ; FALLBACK8-NEXT: movq %r9, 24(%rdx) ; FALLBACK8-NEXT: movq %r10, 8(%rdx) ; FALLBACK8-NEXT: movq %rbx, 16(%rdx) ; FALLBACK8-NEXT: movq %rdi, (%rdx) ; FALLBACK8-NEXT: popq %rbx ; FALLBACK8-NEXT: vzeroupper ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: lshr_32bytes: ; FALLBACK9: # %bb.0: ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: movzbl (%rsi), %eax ; FALLBACK9-NEXT: leal (,%rax,8), %ecx ; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: andb $24, %al ; FALLBACK9-NEXT: movzbl %al, %eax ; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK9-NEXT: movq %rdi, %r8 ; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK9-NEXT: movq %rax, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shrq %cl, %rsi ; FALLBACK9-NEXT: movq %r10, 8(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) ; FALLBACK9-NEXT: movq %r9, (%rdx) ; FALLBACK9-NEXT: vzeroupper ; FALLBACK9-NEXT: retq ; ; FALLBACK10-LABEL: lshr_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: movzbl (%rsi), %ecx ; FALLBACK10-NEXT: leal (,%rcx,8), %eax ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: andb $24, %cl ; FALLBACK10-NEXT: movzbl %cl, %ecx ; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi ; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi ; FALLBACK10-NEXT: movq 
-56(%rsp,%rcx), %r8 ; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx ; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 ; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK10-NEXT: notb %al ; FALLBACK10-NEXT: addq %rdi, %rdi ; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK10-NEXT: orq %rsi, %rdi ; FALLBACK10-NEXT: addq %rcx, %rcx ; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK10-NEXT: orq %r9, %rcx ; FALLBACK10-NEXT: addq %r8, %r8 ; FALLBACK10-NEXT: shlxq %rax, %r8, %rax ; FALLBACK10-NEXT: orq %r10, %rax ; FALLBACK10-NEXT: movq %r11, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) ; FALLBACK10-NEXT: movq %rcx, 16(%rdx) ; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: lshr_32bytes: ; FALLBACK11: # %bb.0: ; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK11-NEXT: movzbl (%rsi), %eax ; FALLBACK11-NEXT: leal (,%rax,8), %ecx ; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: andb $24, %al ; FALLBACK11-NEXT: movzbl %al, %eax ; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK11-NEXT: movq %rdi, %r8 ; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK11-NEXT: movq %rax, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax ; FALLBACK11-NEXT: movq %r10, 8(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) ; FALLBACK11-NEXT: movq %rax, 24(%rdx) ; FALLBACK11-NEXT: movq %r9, (%rdx) ; FALLBACK11-NEXT: vzeroupper ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: lshr_32bytes: ; FALLBACK12: # %bb.0: ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK12-NEXT: movzbl (%rsi), 
%ecx ; FALLBACK12-NEXT: leal (,%rcx,8), %eax ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: andb $24, %cl ; FALLBACK12-NEXT: movzbl %cl, %r9d ; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10 ; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi ; FALLBACK12-NEXT: orq %r10, %rdi ; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10 ; FALLBACK12-NEXT: movq %r10, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r11 ; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9 ; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r11, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r8 ; FALLBACK12-NEXT: addq %r10, %r10 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: orq %r8, %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movq %r9, 24(%rdx) ; FALLBACK12-NEXT: movq %r10, 8(%rdx) ; FALLBACK12-NEXT: movq %rbx, 16(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: popq %rbx ; FALLBACK12-NEXT: vzeroupper ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: lshr_32bytes: ; FALLBACK13: # %bb.0: ; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK13-NEXT: movzbl (%rsi), %eax ; FALLBACK13-NEXT: leal (,%rax,8), %ecx ; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: andb $24, %al ; FALLBACK13-NEXT: movzbl %al, %eax ; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK13-NEXT: movq %rdi, %r8 ; 
FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK13-NEXT: movq %rax, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK13-NEXT: shrq %cl, %rsi ; FALLBACK13-NEXT: movq %r10, 8(%rdx) ; FALLBACK13-NEXT: movq %r8, 16(%rdx) ; FALLBACK13-NEXT: movq %rsi, 24(%rdx) ; FALLBACK13-NEXT: movq %r9, (%rdx) ; FALLBACK13-NEXT: vzeroupper ; FALLBACK13-NEXT: retq ; ; FALLBACK14-LABEL: lshr_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK14-NEXT: movzbl (%rsi), %ecx ; FALLBACK14-NEXT: leal (,%rcx,8), %eax ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: andb $24, %cl ; FALLBACK14-NEXT: movzbl %cl, %ecx ; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi ; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi ; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 ; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx ; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 ; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK14-NEXT: notb %al ; FALLBACK14-NEXT: addq %rdi, %rdi ; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK14-NEXT: orq %rsi, %rdi ; FALLBACK14-NEXT: addq %rcx, %rcx ; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK14-NEXT: orq %r9, %rcx ; FALLBACK14-NEXT: addq %r8, %r8 ; FALLBACK14-NEXT: shlxq %rax, %r8, %rax ; FALLBACK14-NEXT: orq %r10, %rax ; FALLBACK14-NEXT: movq %r11, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) ; FALLBACK14-NEXT: movq %rcx, 16(%rdx) ; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: lshr_32bytes: ; FALLBACK15: # %bb.0: ; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK15-NEXT: 
movzbl (%rsi), %eax ; FALLBACK15-NEXT: leal (,%rax,8), %ecx ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: andb $24, %al ; FALLBACK15-NEXT: movzbl %al, %eax ; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK15-NEXT: movq %rdi, %r8 ; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK15-NEXT: movq %rax, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax ; FALLBACK15-NEXT: movq %r10, 8(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) ; FALLBACK15-NEXT: movq %rax, 24(%rdx) ; FALLBACK15-NEXT: movq %r9, (%rdx) ; FALLBACK15-NEXT: vzeroupper ; FALLBACK15-NEXT: retq ; ; FALLBACK16-LABEL: lshr_32bytes: ; FALLBACK16: # %bb.0: ; FALLBACK16-NEXT: pushl %ebp ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi ; FALLBACK16-NEXT: subl $108, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK16-NEXT: movl (%ebp), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 4(%ebp), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 8(%ebp), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 12(%ebp), %edi ; FALLBACK16-NEXT: movl 16(%ebp), %ebx ; FALLBACK16-NEXT: movb (%eax), %ah ; FALLBACK16-NEXT: movl 20(%ebp), %esi ; FALLBACK16-NEXT: movl 24(%ebp), %ecx ; FALLBACK16-NEXT: movl 28(%ebp), %ebp ; FALLBACK16-NEXT: xorps %xmm0, %xmm0 ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movb %ah, %dh ; FALLBACK16-NEXT: shlb $3, 
%dh ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: andb $28, %ah ; FALLBACK16-NEXT: movzbl %ah, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi ; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax ; FALLBACK16-NEXT: movl %eax, %ebx ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: movb %dh, %dl ; FALLBACK16-NEXT: notb %dl ; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi ; FALLBACK16-NEXT: leal (%edi,%edi), %ebp ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %ebx, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: movl %eax, %ebx ; FALLBACK16-NEXT: addl %eax, %ebx ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: orl %esi, %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp ; FALLBACK16-NEXT: movl %ebp, %esi ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: movl 48(%esp,%eax), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: leal (%eax,%eax), %ebx ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: orl %esi, %ebx ; 
FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %edi, %ebp ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl 52(%esp,%eax), %edi ; FALLBACK16-NEXT: movl %edi, %ebx ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: movl 56(%esp,%eax), %esi ; FALLBACK16-NEXT: leal (%esi,%esi), %eax ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: orl %ebx, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: addl %edi, %edi ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: orl %ebx, %edi ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: movl %esi, %eax ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl 60(%esp,%ecx), %ebx ; FALLBACK16-NEXT: leal (%ebx,%ebx), %esi ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: orl %eax, %esi ; FALLBACK16-NEXT: movb %dh, %cl ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl %ebx, 28(%eax) ; FALLBACK16-NEXT: movl %esi, 24(%eax) ; FALLBACK16-NEXT: movl %edi, 16(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 20(%eax) ; FALLBACK16-NEXT: movl %ebp, 8(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 12(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, (%eax) ; 
FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 4(%eax) ; FALLBACK16-NEXT: addl $108, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx ; FALLBACK16-NEXT: popl %ebp ; FALLBACK16-NEXT: retl ; ; FALLBACK17-LABEL: lshr_32bytes: ; FALLBACK17: # %bb.0: ; FALLBACK17-NEXT: pushl %ebp ; FALLBACK17-NEXT: pushl %ebx ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi ; FALLBACK17-NEXT: subl $92, %esp ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK17-NEXT: movl (%ebp), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 4(%ebp), %eax ; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: movl 8(%ebp), %esi ; FALLBACK17-NEXT: movl 12(%ebp), %edi ; FALLBACK17-NEXT: movl 16(%ebp), %ebx ; FALLBACK17-NEXT: movb (%ecx), %ch ; FALLBACK17-NEXT: movl 20(%ebp), %edx ; FALLBACK17-NEXT: movl 24(%ebp), %eax ; FALLBACK17-NEXT: movl 28(%ebp), %ebp ; FALLBACK17-NEXT: xorps %xmm0, %xmm0 ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movb %ch, %cl ; FALLBACK17-NEXT: shlb $3, %cl ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: andb $28, %ch ; FALLBACK17-NEXT: movzbl %ch, %ebp ; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx ; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, 
%edx, %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx ; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %esi ; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx ; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %edi ; FALLBACK17-NEXT: shrdl %cl, %edx, %edi ; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi ; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK17-NEXT: movl %edx, 24(%ebp) ; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload ; FALLBACK17-NEXT: shrdl %cl, %edx, %esi ; FALLBACK17-NEXT: shrl %cl, %eax ; FALLBACK17-NEXT: movl %eax, 28(%ebp) ; FALLBACK17-NEXT: movl %ebx, 16(%ebp) ; FALLBACK17-NEXT: movl %edi, 20(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 8(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 12(%ebp) ; FALLBACK17-NEXT: movl %esi, (%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 4(%ebp) ; FALLBACK17-NEXT: addl $92, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi ; FALLBACK17-NEXT: popl %ebx ; FALLBACK17-NEXT: popl %ebp ; FALLBACK17-NEXT: retl ; ; FALLBACK18-LABEL: lshr_32bytes: ; FALLBACK18: # %bb.0: ; FALLBACK18-NEXT: pushl %ebp ; FALLBACK18-NEXT: pushl %ebx ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $108, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl (%eax), %ecx ; FALLBACK18-NEXT: movl 
%ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 4(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 8(%eax), %esi ; FALLBACK18-NEXT: movl 12(%eax), %edi ; FALLBACK18-NEXT: movl 16(%eax), %ebp ; FALLBACK18-NEXT: movzbl (%ebx), %ebx ; FALLBACK18-NEXT: movl 20(%eax), %edx ; FALLBACK18-NEXT: movl 24(%eax), %ecx ; FALLBACK18-NEXT: movl 28(%eax), %eax ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, %eax ; FALLBACK18-NEXT: shlb $3, %al ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: andb $28, %bl ; FALLBACK18-NEXT: movzbl %bl, %edi ; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi ; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %eax, %esi, %edx ; FALLBACK18-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl %eax, %edx ; FALLBACK18-NEXT: movl %eax, %ebx ; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl %ebx, %ecx ; FALLBACK18-NEXT: shrxl %ebx, 32(%esp,%edi), %ebx ; FALLBACK18-NEXT: addl %esi, %esi ; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: 
orl %ebx, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 48(%esp,%edi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %ebx ; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi ; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp ; FALLBACK18-NEXT: movl %ecx, %eax ; FALLBACK18-NEXT: shrxl %ecx, %ebp, %ebx ; FALLBACK18-NEXT: orl %ebx, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %eax, %ebx ; FALLBACK18-NEXT: addl %ebp, %ebp ; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl %ecx, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp ; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx ; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx ; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax ; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi ; FALLBACK18-NEXT: orl %esi, %ecx ; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: addl %eax, %eax ; FALLBACK18-NEXT: shlxl %edx, %eax, %esi ; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax ; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi ; FALLBACK18-NEXT: shrxl %ebx, %edi, %ebx ; FALLBACK18-NEXT: addl %edi, %edi ; FALLBACK18-NEXT: shlxl %edx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl %ebx, 28(%eax) ; FALLBACK18-NEXT: movl %edi, 24(%eax) ; FALLBACK18-NEXT: movl %esi, 16(%eax) ; FALLBACK18-NEXT: movl %ecx, 20(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 8(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; 
FALLBACK18-NEXT: movl %ecx, 12(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, (%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 4(%eax) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi ; FALLBACK18-NEXT: popl %ebx ; FALLBACK18-NEXT: popl %ebp ; FALLBACK18-NEXT: retl ; ; FALLBACK19-LABEL: lshr_32bytes: ; FALLBACK19: # %bb.0: ; FALLBACK19-NEXT: pushl %ebp ; FALLBACK19-NEXT: pushl %ebx ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $92, %esp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK19-NEXT: movl (%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 4(%ecx), %eax ; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill ; FALLBACK19-NEXT: movl 8(%ecx), %esi ; FALLBACK19-NEXT: movl 12(%ecx), %edi ; FALLBACK19-NEXT: movl 16(%ecx), %ebp ; FALLBACK19-NEXT: movzbl (%ebx), %ebx ; FALLBACK19-NEXT: movl 20(%ecx), %edx ; FALLBACK19-NEXT: movl 24(%ecx), %eax ; FALLBACK19-NEXT: movl 28(%ecx), %ecx ; FALLBACK19-NEXT: xorps %xmm0, %xmm0 ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebx, %ecx ; FALLBACK19-NEXT: shlb $3, %cl ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: andb $28, %bl ; FALLBACK19-NEXT: movzbl %bl, %ebp ; FALLBACK19-NEXT: 
movl 24(%esp,%ebp), %esi ; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %esi, %eax ; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill ; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx ; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %esi ; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx ; FALLBACK19-NEXT: movl %edx, %esi ; FALLBACK19-NEXT: shrdl %cl, %eax, %esi ; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx ; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx ; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi ; FALLBACK19-NEXT: shrdl %cl, %edi, %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK19-NEXT: movl %eax, 24(%ebp) ; FALLBACK19-NEXT: shrxl %ecx, %edi, %eax ; FALLBACK19-NEXT: movl %eax, 28(%ebp) ; FALLBACK19-NEXT: movl %ebx, 16(%ebp) ; FALLBACK19-NEXT: movl %esi, 20(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 8(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 12(%ebp) ; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: shrdl %cl, %eax, %edx ; FALLBACK19-NEXT: movl %edx, (%ebp) ; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 4(%ebp) ; FALLBACK19-NEXT: addl $92, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi ; FALLBACK19-NEXT: popl %ebx ; FALLBACK19-NEXT: popl %ebp ; FALLBACK19-NEXT: retl ; ; FALLBACK20-LABEL: lshr_32bytes: ; FALLBACK20: # %bb.0: ; FALLBACK20-NEXT: pushl %ebp ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; 
FALLBACK20-NEXT: pushl %esi ; FALLBACK20-NEXT: subl $108, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 ; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK20-NEXT: movzbl (%eax), %ecx ; FALLBACK20-NEXT: movl %ecx, %eax ; FALLBACK20-NEXT: shlb $3, %al ; FALLBACK20-NEXT: xorps %xmm2, %xmm2 ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: andb $28, %cl ; FALLBACK20-NEXT: movzbl %cl, %edi ; FALLBACK20-NEXT: movl 32(%esp,%edi), %esi ; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %esi ; FALLBACK20-NEXT: movl %eax, %edx ; FALLBACK20-NEXT: notb %dl ; FALLBACK20-NEXT: addl %ebx, %ebx ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %esi, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebp ; FALLBACK20-NEXT: movl %ebp, %esi ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %esi ; FALLBACK20-NEXT: movl 48(%esp,%edi), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %esi, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi ; FALLBACK20-NEXT: movl %esi, %ebx ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: addl %ebp, %ebp ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebp ; FALLBACK20-NEXT: orl %ebx, %ebp ; 
FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp ; FALLBACK20-NEXT: movl %ebp, %ebx ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx ; FALLBACK20-NEXT: movl %ecx, (%esp) # 4-byte Spill ; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %edi ; FALLBACK20-NEXT: orl %ebx, %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: addl %ebp, %ebp ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebp ; FALLBACK20-NEXT: orl %edi, %ebp ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl 60(%esp,%ecx), %ebx ; FALLBACK20-NEXT: leal (%ebx,%ebx), %edi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %edi ; FALLBACK20-NEXT: orl (%esp), %edi # 4-byte Folded Reload ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl %ebx, 28(%eax) ; FALLBACK20-NEXT: movl %esi, 4(%eax) ; FALLBACK20-NEXT: movl %edi, 24(%eax) ; FALLBACK20-NEXT: movl %ebp, 16(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 20(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 8(%eax) ; FALLBACK20-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 12(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, (%eax) ; FALLBACK20-NEXT: addl $108, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx ; FALLBACK20-NEXT: popl %ebp ; FALLBACK20-NEXT: retl ; ; FALLBACK21-LABEL: lshr_32bytes: ; FALLBACK21: # %bb.0: ; FALLBACK21-NEXT: pushl %ebp ; FALLBACK21-NEXT: pushl %ebx ; FALLBACK21-NEXT: pushl %edi ; FALLBACK21-NEXT: pushl %esi ; FALLBACK21-NEXT: subl $108, %esp ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK21-NEXT: movups (%ecx), %xmm0 ; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK21-NEXT: movzbl (%eax), %eax ; FALLBACK21-NEXT: movl %eax, %ecx ; FALLBACK21-NEXT: shlb $3, %cl ; FALLBACK21-NEXT: xorps %xmm2, %xmm2 ; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: andb $28, %al ; FALLBACK21-NEXT: movzbl %al, %ebp ; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi ; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx ; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 
60(%esp,%ebp), %eax ; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx ; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi ; FALLBACK21-NEXT: movl %edi, %esi ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK21-NEXT: movl %esi, 4(%ebp) ; FALLBACK21-NEXT: movl %ebx, 24(%ebp) ; FALLBACK21-NEXT: shrdl %cl, %edi, %edx ; FALLBACK21-NEXT: shrl %cl, %eax ; FALLBACK21-NEXT: movl %eax, 28(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 16(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 20(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 8(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 12(%ebp) ; FALLBACK21-NEXT: movl %edx, (%ebp) ; FALLBACK21-NEXT: addl $108, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi ; FALLBACK21-NEXT: popl %ebx ; FALLBACK21-NEXT: popl %ebp ; FALLBACK21-NEXT: retl ; ; FALLBACK22-LABEL: lshr_32bytes: ; FALLBACK22: # %bb.0: ; FALLBACK22-NEXT: pushl %ebp ; FALLBACK22-NEXT: pushl %ebx ; FALLBACK22-NEXT: pushl %edi ; FALLBACK22-NEXT: pushl %esi ; FALLBACK22-NEXT: subl $108, %esp ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK22-NEXT: movzbl (%eax), %ecx ; FALLBACK22-NEXT: movl %ecx, %edx ; FALLBACK22-NEXT: shlb $3, %dl ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: andb $28, %cl ; FALLBACK22-NEXT: movzbl %cl, %edi ; 
FALLBACK22-NEXT: shrxl %edx, 32(%esp,%edi), %ecx ; FALLBACK22-NEXT: movl %edx, %eax ; FALLBACK22-NEXT: notb %al ; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: addl %esi, %esi ; FALLBACK22-NEXT: shlxl %eax, %esi, %esi ; FALLBACK22-NEXT: orl %ecx, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: addl %ecx, %ecx ; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi ; FALLBACK22-NEXT: movl %eax, %ebp ; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx ; FALLBACK22-NEXT: shrxl %edx, %ecx, %ebx ; FALLBACK22-NEXT: orl %ebx, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: addl %ecx, %ecx ; FALLBACK22-NEXT: shlxl %eax, %ecx, %esi ; FALLBACK22-NEXT: movl 40(%esp,%edi), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, %eax, %ebx ; FALLBACK22-NEXT: orl %ebx, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %ebx ; FALLBACK22-NEXT: shlxl %ebp, %ebx, %eax ; FALLBACK22-NEXT: movl %ebp, %ecx ; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx ; FALLBACK22-NEXT: shrxl %edx, %ebx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %ebx, %ebx ; FALLBACK22-NEXT: shlxl %ecx, %ebx, %ebx ; FALLBACK22-NEXT: orl %ebp, %ebx ; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp ; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi ; FALLBACK22-NEXT: shrxl %edx, %edi, %eax ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: movl %ecx, %edx ; 
FALLBACK22-NEXT: shlxl %ecx, %edi, %edi ; FALLBACK22-NEXT: orl %ebp, %edi ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: addl %ecx, %ecx ; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx ; FALLBACK22-NEXT: orl %esi, %ecx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK22-NEXT: movl %eax, 28(%edx) ; FALLBACK22-NEXT: movl %ecx, 4(%edx) ; FALLBACK22-NEXT: movl %edi, 24(%edx) ; FALLBACK22-NEXT: movl %ebx, 16(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 12(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, (%edx) ; FALLBACK22-NEXT: addl $108, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx ; FALLBACK22-NEXT: popl %ebp ; FALLBACK22-NEXT: retl ; ; FALLBACK23-LABEL: lshr_32bytes: ; FALLBACK23: # %bb.0: ; FALLBACK23-NEXT: pushl %ebp ; FALLBACK23-NEXT: pushl %ebx ; FALLBACK23-NEXT: pushl %edi ; FALLBACK23-NEXT: pushl %esi ; FALLBACK23-NEXT: subl $108, %esp ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK23-NEXT: movups (%ecx), %xmm0 ; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK23-NEXT: movzbl (%eax), %eax ; FALLBACK23-NEXT: movl %eax, %ecx ; FALLBACK23-NEXT: shlb $3, %cl ; FALLBACK23-NEXT: xorps %xmm2, %xmm2 ; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: andb $28, %al ; FALLBACK23-NEXT: movzbl %al, %ebx ; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi ; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax ; FALLBACK23-NEXT: movl %eax, %edx ; 
FALLBACK23-NEXT: shrdl %cl, %esi, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp ; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK23-NEXT: movl %eax, %edi ; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi ; FALLBACK23-NEXT: shrdl %cl, %eax, %esi ; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp ; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx ; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx ; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl %ebx, 4(%eax) ; FALLBACK23-NEXT: movl %ebp, 24(%eax) ; FALLBACK23-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK23-NEXT: movl %ebx, 28(%eax) ; FALLBACK23-NEXT: movl %esi, 16(%eax) ; FALLBACK23-NEXT: movl %edi, 20(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK23-NEXT: movl %esi, 8(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK23-NEXT: movl %esi, 12(%eax) ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK23-NEXT: shrdl %cl, %esi, %edx ; FALLBACK23-NEXT: movl %edx, (%eax) ; FALLBACK23-NEXT: addl $108, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi ; FALLBACK23-NEXT: popl %ebx ; FALLBACK23-NEXT: popl %ebp ; FALLBACK23-NEXT: retl ; ; FALLBACK24-LABEL: lshr_32bytes: ; FALLBACK24: # %bb.0: ; FALLBACK24-NEXT: pushl %ebp ; FALLBACK24-NEXT: 
pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi ; FALLBACK24-NEXT: subl $108, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK24-NEXT: movzbl (%eax), %ecx ; FALLBACK24-NEXT: movl %ecx, %eax ; FALLBACK24-NEXT: shlb $3, %al ; FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: andb $28, %cl ; FALLBACK24-NEXT: movzbl %cl, %edi ; FALLBACK24-NEXT: movl 32(%esp,%edi), %esi ; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %esi ; FALLBACK24-NEXT: movl %eax, %edx ; FALLBACK24-NEXT: notb %dl ; FALLBACK24-NEXT: addl %ebx, %ebx ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %esi, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebp ; FALLBACK24-NEXT: movl %ebp, %esi ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %esi ; FALLBACK24-NEXT: movl 48(%esp,%edi), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %esi, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi ; FALLBACK24-NEXT: movl %esi, %ebx ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: addl %ebp, %ebp ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebp ; FALLBACK24-NEXT: orl %ebx, %ebp ; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 
52(%esp,%edi), %ebp ; FALLBACK24-NEXT: movl %ebp, %ebx ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx ; FALLBACK24-NEXT: movl %ecx, (%esp) # 4-byte Spill ; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: orl %ebx, %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: addl %ebp, %ebp ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebp ; FALLBACK24-NEXT: orl %edi, %ebp ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl 60(%esp,%ecx), %ebx ; FALLBACK24-NEXT: leal (%ebx,%ebx), %edi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: orl (%esp), %edi # 4-byte Folded Reload ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl %ebx, 28(%eax) ; FALLBACK24-NEXT: movl %esi, 4(%eax) ; FALLBACK24-NEXT: movl %edi, 24(%eax) ; FALLBACK24-NEXT: movl %ebp, 16(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 20(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 8(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 12(%eax) ; 
FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, (%eax) ; FALLBACK24-NEXT: addl $108, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx ; FALLBACK24-NEXT: popl %ebp ; FALLBACK24-NEXT: vzeroupper ; FALLBACK24-NEXT: retl ; ; FALLBACK25-LABEL: lshr_32bytes: ; FALLBACK25: # %bb.0: ; FALLBACK25-NEXT: pushl %ebp ; FALLBACK25-NEXT: pushl %ebx ; FALLBACK25-NEXT: pushl %edi ; FALLBACK25-NEXT: pushl %esi ; FALLBACK25-NEXT: subl $108, %esp ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK25-NEXT: movzbl (%eax), %eax ; FALLBACK25-NEXT: movl %eax, %ecx ; FALLBACK25-NEXT: shlb $3, %cl ; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: andb $28, %al ; FALLBACK25-NEXT: movzbl %al, %ebp ; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi ; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx ; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx ; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi ; FALLBACK25-NEXT: movl %edi, %esi ; 
FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK25-NEXT: movl %esi, 4(%ebp) ; FALLBACK25-NEXT: movl %ebx, 24(%ebp) ; FALLBACK25-NEXT: shrdl %cl, %edi, %edx ; FALLBACK25-NEXT: shrl %cl, %eax ; FALLBACK25-NEXT: movl %eax, 28(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 16(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 20(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 8(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 12(%ebp) ; FALLBACK25-NEXT: movl %edx, (%ebp) ; FALLBACK25-NEXT: addl $108, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi ; FALLBACK25-NEXT: popl %ebx ; FALLBACK25-NEXT: popl %ebp ; FALLBACK25-NEXT: vzeroupper ; FALLBACK25-NEXT: retl ; ; FALLBACK26-LABEL: lshr_32bytes: ; FALLBACK26: # %bb.0: ; FALLBACK26-NEXT: pushl %ebp ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi ; FALLBACK26-NEXT: subl $108, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK26-NEXT: movzbl (%eax), %ecx ; FALLBACK26-NEXT: movl %ecx, %edx ; FALLBACK26-NEXT: shlb $3, %dl ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: andb $28, %cl ; FALLBACK26-NEXT: movzbl %cl, %edi ; FALLBACK26-NEXT: shrxl %edx, 32(%esp,%edi), %ecx ; FALLBACK26-NEXT: movl %edx, %eax ; FALLBACK26-NEXT: notb %al ; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: addl %esi, %esi ; FALLBACK26-NEXT: shlxl %eax, 
%esi, %esi ; FALLBACK26-NEXT: orl %ecx, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: addl %ecx, %ecx ; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi ; FALLBACK26-NEXT: movl %eax, %ebp ; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx ; FALLBACK26-NEXT: shrxl %edx, %ecx, %ebx ; FALLBACK26-NEXT: orl %ebx, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: addl %ecx, %ecx ; FALLBACK26-NEXT: shlxl %eax, %ecx, %esi ; FALLBACK26-NEXT: movl 40(%esp,%edi), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, %eax, %ebx ; FALLBACK26-NEXT: orl %ebx, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %ebx ; FALLBACK26-NEXT: shlxl %ebp, %ebx, %eax ; FALLBACK26-NEXT: movl %ebp, %ecx ; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx ; FALLBACK26-NEXT: shrxl %edx, %ebx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %ebx, %ebx ; FALLBACK26-NEXT: shlxl %ecx, %ebx, %ebx ; FALLBACK26-NEXT: orl %ebp, %ebx ; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp ; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi ; FALLBACK26-NEXT: shrxl %edx, %edi, %eax ; FALLBACK26-NEXT: addl %edi, %edi ; FALLBACK26-NEXT: movl %ecx, %edx ; FALLBACK26-NEXT: shlxl %ecx, %edi, %edi ; FALLBACK26-NEXT: orl %ebp, %edi ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: addl %ecx, %ecx ; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx ; FALLBACK26-NEXT: orl %esi, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), 
%edx ; FALLBACK26-NEXT: movl %eax, 28(%edx) ; FALLBACK26-NEXT: movl %ecx, 4(%edx) ; FALLBACK26-NEXT: movl %edi, 24(%edx) ; FALLBACK26-NEXT: movl %ebx, 16(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 12(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, (%edx) ; FALLBACK26-NEXT: addl $108, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx ; FALLBACK26-NEXT: popl %ebp ; FALLBACK26-NEXT: vzeroupper ; FALLBACK26-NEXT: retl ; ; FALLBACK27-LABEL: lshr_32bytes: ; FALLBACK27: # %bb.0: ; FALLBACK27-NEXT: pushl %ebp ; FALLBACK27-NEXT: pushl %ebx ; FALLBACK27-NEXT: pushl %edi ; FALLBACK27-NEXT: pushl %esi ; FALLBACK27-NEXT: subl $108, %esp ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK27-NEXT: movzbl (%eax), %eax ; FALLBACK27-NEXT: movl %eax, %ecx ; FALLBACK27-NEXT: shlb $3, %cl ; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: andb $28, %al ; FALLBACK27-NEXT: movzbl %al, %ebx ; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi ; FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax ; FALLBACK27-NEXT: movl %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %esi, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp ; FALLBACK27-NEXT: 
movl 52(%esp,%ebx), %eax ; FALLBACK27-NEXT: movl %eax, %edi ; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi ; FALLBACK27-NEXT: shrdl %cl, %eax, %esi ; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp ; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx ; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx ; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: movl %ebx, 4(%eax) ; FALLBACK27-NEXT: movl %ebp, 24(%eax) ; FALLBACK27-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK27-NEXT: movl %ebx, 28(%eax) ; FALLBACK27-NEXT: movl %esi, 16(%eax) ; FALLBACK27-NEXT: movl %edi, 20(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK27-NEXT: movl %esi, 8(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK27-NEXT: movl %esi, 12(%eax) ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK27-NEXT: shrdl %cl, %esi, %edx ; FALLBACK27-NEXT: movl %edx, (%eax) ; FALLBACK27-NEXT: addl $108, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi ; FALLBACK27-NEXT: popl %ebx ; FALLBACK27-NEXT: popl %ebp ; FALLBACK27-NEXT: vzeroupper ; FALLBACK27-NEXT: retl ; ; FALLBACK28-LABEL: lshr_32bytes: ; FALLBACK28: # %bb.0: ; FALLBACK28-NEXT: pushl %ebp ; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi ; FALLBACK28-NEXT: subl $108, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK28-NEXT: movzbl (%eax), %ecx ; FALLBACK28-NEXT: movl %ecx, %eax ; FALLBACK28-NEXT: shlb $3, %al ; FALLBACK28-NEXT: 
vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: andb $28, %cl ; FALLBACK28-NEXT: movzbl %cl, %edi ; FALLBACK28-NEXT: movl 32(%esp,%edi), %esi ; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %esi ; FALLBACK28-NEXT: movl %eax, %edx ; FALLBACK28-NEXT: notb %dl ; FALLBACK28-NEXT: addl %ebx, %ebx ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %esi, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebp ; FALLBACK28-NEXT: movl %ebp, %esi ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %esi ; FALLBACK28-NEXT: movl 48(%esp,%edi), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %esi, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi ; FALLBACK28-NEXT: movl %esi, %ebx ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: addl %ebp, %ebp ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebp ; FALLBACK28-NEXT: orl %ebx, %ebp ; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp ; FALLBACK28-NEXT: movl %ebp, %ebx ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx ; FALLBACK28-NEXT: movl %ecx, (%esp) # 4-byte Spill ; FALLBACK28-NEXT: leal (%ecx,%ecx), %edi ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %edi ; FALLBACK28-NEXT: orl %ebx, %edi ; 
FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: addl %ebp, %ebp ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebp ; FALLBACK28-NEXT: orl %edi, %ebp ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, (%esp) # 4-byte Folded Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl 60(%esp,%ecx), %ebx ; FALLBACK28-NEXT: leal (%ebx,%ebx), %edi ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %edi ; FALLBACK28-NEXT: orl (%esp), %edi # 4-byte Folded Reload ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl %ebx, 28(%eax) ; FALLBACK28-NEXT: movl %esi, 4(%eax) ; FALLBACK28-NEXT: movl %edi, 24(%eax) ; FALLBACK28-NEXT: movl %ebp, 16(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 20(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 8(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 12(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, (%eax) ; FALLBACK28-NEXT: addl $108, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx ; FALLBACK28-NEXT: popl %ebp ; FALLBACK28-NEXT: vzeroupper ; FALLBACK28-NEXT: retl ; ; FALLBACK29-LABEL: lshr_32bytes: ; FALLBACK29: # %bb.0: ; 
FALLBACK29-NEXT: pushl %ebp ; FALLBACK29-NEXT: pushl %ebx ; FALLBACK29-NEXT: pushl %edi ; FALLBACK29-NEXT: pushl %esi ; FALLBACK29-NEXT: subl $108, %esp ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK29-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK29-NEXT: movzbl (%eax), %eax ; FALLBACK29-NEXT: movl %eax, %ecx ; FALLBACK29-NEXT: shlb $3, %cl ; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: andb $28, %al ; FALLBACK29-NEXT: movzbl %al, %ebp ; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi ; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx ; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx ; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi ; FALLBACK29-NEXT: movl %edi, %esi ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK29-NEXT: movl %esi, 4(%ebp) ; FALLBACK29-NEXT: movl %ebx, 24(%ebp) ; FALLBACK29-NEXT: shrdl %cl, %edi, %edx ; FALLBACK29-NEXT: shrl %cl, %eax ; FALLBACK29-NEXT: movl %eax, 28(%ebp) ; FALLBACK29-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 16(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 20(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 8(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 12(%ebp) ; FALLBACK29-NEXT: movl %edx, (%ebp) ; FALLBACK29-NEXT: addl $108, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi ; FALLBACK29-NEXT: popl %ebx ; FALLBACK29-NEXT: popl %ebp ; FALLBACK29-NEXT: vzeroupper ; FALLBACK29-NEXT: retl ; ; FALLBACK30-LABEL: lshr_32bytes: ; FALLBACK30: # %bb.0: ; FALLBACK30-NEXT: pushl %ebp ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi ; FALLBACK30-NEXT: subl $108, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK30-NEXT: movzbl (%eax), %ecx ; FALLBACK30-NEXT: movl %ecx, %edx ; FALLBACK30-NEXT: shlb $3, %dl ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: andb $28, %cl ; FALLBACK30-NEXT: movzbl %cl, %edi ; FALLBACK30-NEXT: shrxl %edx, 32(%esp,%edi), %ecx ; FALLBACK30-NEXT: movl %edx, %eax ; FALLBACK30-NEXT: notb %al ; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: addl %esi, %esi ; FALLBACK30-NEXT: shlxl %eax, %esi, %esi ; FALLBACK30-NEXT: orl %ecx, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 48(%esp,%edi), %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: addl %ecx, %ecx ; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi ; FALLBACK30-NEXT: movl %eax, %ebp ; FALLBACK30-NEXT: movl 
44(%esp,%edi), %ecx ; FALLBACK30-NEXT: shrxl %edx, %ecx, %ebx ; FALLBACK30-NEXT: orl %ebx, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: addl %ecx, %ecx ; FALLBACK30-NEXT: shlxl %eax, %ecx, %esi ; FALLBACK30-NEXT: movl 40(%esp,%edi), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %edx, %eax, %ebx ; FALLBACK30-NEXT: orl %ebx, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %ebx ; FALLBACK30-NEXT: shlxl %ebp, %ebx, %eax ; FALLBACK30-NEXT: movl %ebp, %ecx ; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx ; FALLBACK30-NEXT: shrxl %edx, %ebx, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %ebx, %ebx ; FALLBACK30-NEXT: shlxl %ecx, %ebx, %ebx ; FALLBACK30-NEXT: orl %ebp, %ebx ; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp ; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi ; FALLBACK30-NEXT: shrxl %edx, %edi, %eax ; FALLBACK30-NEXT: addl %edi, %edi ; FALLBACK30-NEXT: movl %ecx, %edx ; FALLBACK30-NEXT: shlxl %ecx, %edi, %edi ; FALLBACK30-NEXT: orl %ebp, %edi ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: addl %ecx, %ecx ; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx ; FALLBACK30-NEXT: orl %esi, %ecx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK30-NEXT: movl %eax, 28(%edx) ; FALLBACK30-NEXT: movl %ecx, 4(%edx) ; FALLBACK30-NEXT: movl %edi, 24(%edx) ; FALLBACK30-NEXT: movl %ebx, 16(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl 
%eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 12(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, (%edx) ; FALLBACK30-NEXT: addl $108, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx ; FALLBACK30-NEXT: popl %ebp ; FALLBACK30-NEXT: vzeroupper ; FALLBACK30-NEXT: retl ; ; FALLBACK31-LABEL: lshr_32bytes: ; FALLBACK31: # %bb.0: ; FALLBACK31-NEXT: pushl %ebp ; FALLBACK31-NEXT: pushl %ebx ; FALLBACK31-NEXT: pushl %edi ; FALLBACK31-NEXT: pushl %esi ; FALLBACK31-NEXT: subl $108, %esp ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK31-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK31-NEXT: movzbl (%eax), %eax ; FALLBACK31-NEXT: movl %eax, %ecx ; FALLBACK31-NEXT: shlb $3, %cl ; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: andb $28, %al ; FALLBACK31-NEXT: movzbl %al, %ebx ; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi ; FALLBACK31-NEXT: movl 44(%esp,%ebx), %eax ; FALLBACK31-NEXT: movl %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %esi, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp ; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK31-NEXT: movl %eax, %edi ; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi ; FALLBACK31-NEXT: shrdl %cl, %eax, %esi ; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp ; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx ; FALLBACK31-NEXT: movl 
36(%esp,%ebx), %ebx ; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl %ebx, 4(%eax) ; FALLBACK31-NEXT: movl %ebp, 24(%eax) ; FALLBACK31-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK31-NEXT: movl %ebx, 28(%eax) ; FALLBACK31-NEXT: movl %esi, 16(%eax) ; FALLBACK31-NEXT: movl %edi, 20(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK31-NEXT: movl %esi, 8(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK31-NEXT: movl %esi, 12(%eax) ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK31-NEXT: shrdl %cl, %esi, %edx ; FALLBACK31-NEXT: movl %edx, (%eax) ; FALLBACK31-NEXT: addl $108, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi ; FALLBACK31-NEXT: popl %ebx ; FALLBACK31-NEXT: popl %ebp ; FALLBACK31-NEXT: vzeroupper ; FALLBACK31-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 %bitOff = shl i256 %byteOff, 3 %res = lshr i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } define void @lshr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: lshr_32bytes_dwordOff: ; FALLBACK0: # %bb.0: ; FALLBACK0-NEXT: pushq %rbx ; FALLBACK0-NEXT: movq (%rdi), %rcx ; FALLBACK0-NEXT: movq 8(%rdi), %r8 ; FALLBACK0-NEXT: movq 16(%rdi), %r9 ; FALLBACK0-NEXT: movq 24(%rdi), %rdi ; FALLBACK0-NEXT: movzbl (%rsi), %esi ; FALLBACK0-NEXT: movl %esi, %eax ; FALLBACK0-NEXT: shlb $5, %al ; FALLBACK0-NEXT: xorps %xmm0, %xmm0 ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r9, 
-{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: andb $6, %sil ; FALLBACK0-NEXT: movzbl %sil, %r9d ; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10 ; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi ; FALLBACK0-NEXT: movq %rdi, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx ; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 ; FALLBACK0-NEXT: orq %r11, %r8 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r10 ; FALLBACK0-NEXT: addq %rdi, %rdi ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %rdi ; FALLBACK0-NEXT: orq %r10, %rdi ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx ; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9 ; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: orq %rbx, %r10 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r9 ; FALLBACK0-NEXT: movq %r9, 24(%rdx) ; FALLBACK0-NEXT: movq %r10, 16(%rdx) ; FALLBACK0-NEXT: movq %rdi, (%rdx) ; FALLBACK0-NEXT: movq %r8, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: lshr_32bytes_dwordOff: ; FALLBACK1: # %bb.0: ; FALLBACK1-NEXT: movq (%rdi), %rax ; FALLBACK1-NEXT: movq 8(%rdi), %r8 ; FALLBACK1-NEXT: movq 16(%rdi), %r9 ; FALLBACK1-NEXT: movq 24(%rdi), %rdi ; FALLBACK1-NEXT: movzbl (%rsi), %esi ; FALLBACK1-NEXT: movl %esi, %ecx ; FALLBACK1-NEXT: shlb $5, %cl ; FALLBACK1-NEXT: xorps %xmm0, %xmm0 ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: andb $6, 
%sil ; FALLBACK1-NEXT: movzbl %sil, %eax ; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi ; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi ; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8 ; FALLBACK1-NEXT: movq %r8, %r9 ; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 ; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax ; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi ; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi ; FALLBACK1-NEXT: shrq %cl, %rax ; FALLBACK1-NEXT: movq %rsi, 16(%rdx) ; FALLBACK1-NEXT: movq %rax, 24(%rdx) ; FALLBACK1-NEXT: movq %rdi, (%rdx) ; FALLBACK1-NEXT: movq %r9, 8(%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: lshr_32bytes_dwordOff: ; FALLBACK2: # %bb.0: ; FALLBACK2-NEXT: movq (%rdi), %rcx ; FALLBACK2-NEXT: movq 8(%rdi), %r8 ; FALLBACK2-NEXT: movq 16(%rdi), %r9 ; FALLBACK2-NEXT: movq 24(%rdi), %rdi ; FALLBACK2-NEXT: movzbl (%rsi), %esi ; FALLBACK2-NEXT: movl %esi, %eax ; FALLBACK2-NEXT: shlb $5, %al ; FALLBACK2-NEXT: xorps %xmm0, %xmm0 ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: andb $6, %sil ; FALLBACK2-NEXT: movzbl %sil, %ecx ; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi ; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi ; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 ; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 ; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx ; FALLBACK2-NEXT: shrxq %rax, %rcx, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK2-NEXT: orq %r8, %rdi ; FALLBACK2-NEXT: addq %rsi, %rsi ; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi ; FALLBACK2-NEXT: orq %r9, %rsi ; FALLBACK2-NEXT: addq %rcx, %rcx ; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax ; FALLBACK2-NEXT: 
orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) ; FALLBACK2-NEXT: movq %rsi, (%rdx) ; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_32bytes_dwordOff: ; FALLBACK3: # %bb.0: ; FALLBACK3-NEXT: movq (%rdi), %rax ; FALLBACK3-NEXT: movq 8(%rdi), %r8 ; FALLBACK3-NEXT: movq 16(%rdi), %r9 ; FALLBACK3-NEXT: movq 24(%rdi), %rdi ; FALLBACK3-NEXT: movzbl (%rsi), %esi ; FALLBACK3-NEXT: movl %esi, %ecx ; FALLBACK3-NEXT: shlb $5, %cl ; FALLBACK3-NEXT: xorps %xmm0, %xmm0 ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: andb $6, %sil ; FALLBACK3-NEXT: movzbl %sil, %eax ; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi ; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi ; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8 ; FALLBACK3-NEXT: movq %r8, %r9 ; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 ; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax ; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi ; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi ; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax ; FALLBACK3-NEXT: movq %rsi, 16(%rdx) ; FALLBACK3-NEXT: movq %rax, 24(%rdx) ; FALLBACK3-NEXT: movq %rdi, (%rdx) ; FALLBACK3-NEXT: movq %r9, 8(%rdx) ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: lshr_32bytes_dwordOff: ; FALLBACK4: # %bb.0: ; FALLBACK4-NEXT: pushq %rbx ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK4-NEXT: movzbl (%rsi), %ecx ; FALLBACK4-NEXT: movl %ecx, %eax ; FALLBACK4-NEXT: shlb $5, %al ; FALLBACK4-NEXT: xorps %xmm2, %xmm2 ; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: andb $6, %cl ; 
FALLBACK4-NEXT: movzbl %cl, %r9d ; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10 ; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rdi ; FALLBACK4-NEXT: orq %r10, %rdi ; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10 ; FALLBACK4-NEXT: movq %r10, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r11 ; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9 ; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rbx ; FALLBACK4-NEXT: orq %r11, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 ; FALLBACK4-NEXT: addq %r10, %r10 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: orq %r8, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r9 ; FALLBACK4-NEXT: movq %r9, 24(%rdx) ; FALLBACK4-NEXT: movq %r10, 8(%rdx) ; FALLBACK4-NEXT: movq %rbx, 16(%rdx) ; FALLBACK4-NEXT: movq %rdi, (%rdx) ; FALLBACK4-NEXT: popq %rbx ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: lshr_32bytes_dwordOff: ; FALLBACK5: # %bb.0: ; FALLBACK5-NEXT: movups (%rdi), %xmm0 ; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK5-NEXT: movzbl (%rsi), %eax ; FALLBACK5-NEXT: movl %eax, %ecx ; FALLBACK5-NEXT: shlb $5, %cl ; FALLBACK5-NEXT: xorps %xmm2, %xmm2 ; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: andb $6, %al ; FALLBACK5-NEXT: movzbl %al, %eax ; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK5-NEXT: movq %rdi, %r8 ; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax ; 
FALLBACK5-NEXT: movq %rax, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK5-NEXT: shrq %cl, %rsi ; FALLBACK5-NEXT: movq %r10, 8(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) ; FALLBACK5-NEXT: movq %r9, (%rdx) ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: lshr_32bytes_dwordOff: ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movzbl (%rsi), %ecx ; FALLBACK6-NEXT: movl %ecx, %eax ; FALLBACK6-NEXT: shlb $5, %al ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: andb $6, %cl ; FALLBACK6-NEXT: movzbl %cl, %ecx ; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi ; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi ; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 ; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx ; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK6-NEXT: shrxq %rax, %rcx, %r11 ; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK6-NEXT: notb %al ; FALLBACK6-NEXT: addq %rdi, %rdi ; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK6-NEXT: orq %rsi, %rdi ; FALLBACK6-NEXT: addq %rcx, %rcx ; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK6-NEXT: orq %r9, %rcx ; FALLBACK6-NEXT: addq %r8, %r8 ; FALLBACK6-NEXT: shlxq %rax, %r8, %rax ; FALLBACK6-NEXT: orq %r10, %rax ; FALLBACK6-NEXT: movq %r11, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) ; FALLBACK6-NEXT: movq %rcx, 16(%rdx) ; FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_32bytes_dwordOff: ; FALLBACK7: # %bb.0: ; FALLBACK7-NEXT: movups (%rdi), %xmm0 ; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK7-NEXT: movzbl (%rsi), %eax ; FALLBACK7-NEXT: movl %eax, %ecx ; FALLBACK7-NEXT: 
shlb $5, %cl ; FALLBACK7-NEXT: xorps %xmm2, %xmm2 ; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: andb $6, %al ; FALLBACK7-NEXT: movzbl %al, %eax ; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK7-NEXT: movq %rdi, %r8 ; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax ; FALLBACK7-NEXT: movq %rax, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK7-NEXT: shrxq %rcx, %rsi, %rax ; FALLBACK7-NEXT: movq %r10, 8(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) ; FALLBACK7-NEXT: movq %rax, 24(%rdx) ; FALLBACK7-NEXT: movq %r9, (%rdx) ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: lshr_32bytes_dwordOff: ; FALLBACK8: # %bb.0: ; FALLBACK8-NEXT: pushq %rbx ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: movzbl (%rsi), %ecx ; FALLBACK8-NEXT: movl %ecx, %eax ; FALLBACK8-NEXT: shlb $5, %al ; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: andb $6, %cl ; FALLBACK8-NEXT: movzbl %cl, %r9d ; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10 ; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rdi ; FALLBACK8-NEXT: orq %r10, %rdi ; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10 ; FALLBACK8-NEXT: movq %r10, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r11 ; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9 ; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rbx 
; FALLBACK8-NEXT: orq %r11, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r8 ; FALLBACK8-NEXT: addq %r10, %r10 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: orq %r8, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r9 ; FALLBACK8-NEXT: movq %r9, 24(%rdx) ; FALLBACK8-NEXT: movq %r10, 8(%rdx) ; FALLBACK8-NEXT: movq %rbx, 16(%rdx) ; FALLBACK8-NEXT: movq %rdi, (%rdx) ; FALLBACK8-NEXT: popq %rbx ; FALLBACK8-NEXT: vzeroupper ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: lshr_32bytes_dwordOff: ; FALLBACK9: # %bb.0: ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: movzbl (%rsi), %eax ; FALLBACK9-NEXT: movl %eax, %ecx ; FALLBACK9-NEXT: shlb $5, %cl ; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: andb $6, %al ; FALLBACK9-NEXT: movzbl %al, %eax ; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK9-NEXT: movq %rdi, %r8 ; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax ; FALLBACK9-NEXT: movq %rax, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK9-NEXT: shrq %cl, %rsi ; FALLBACK9-NEXT: movq %r10, 8(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) ; FALLBACK9-NEXT: movq %r9, (%rdx) ; FALLBACK9-NEXT: vzeroupper ; FALLBACK9-NEXT: retq ; ; FALLBACK10-LABEL: lshr_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: movzbl (%rsi), %ecx ; FALLBACK10-NEXT: movl %ecx, %eax ; FALLBACK10-NEXT: shlb $5, %al ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: andb $6, %cl ; FALLBACK10-NEXT: movzbl %cl, %ecx ; FALLBACK10-NEXT: 
shrxq %rax, -72(%rsp,%rcx,4), %rsi ; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi ; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 ; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx ; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK10-NEXT: shrxq %rax, %rcx, %r11 ; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK10-NEXT: notb %al ; FALLBACK10-NEXT: addq %rdi, %rdi ; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK10-NEXT: orq %rsi, %rdi ; FALLBACK10-NEXT: addq %rcx, %rcx ; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK10-NEXT: orq %r9, %rcx ; FALLBACK10-NEXT: addq %r8, %r8 ; FALLBACK10-NEXT: shlxq %rax, %r8, %rax ; FALLBACK10-NEXT: orq %r10, %rax ; FALLBACK10-NEXT: movq %r11, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) ; FALLBACK10-NEXT: movq %rcx, 16(%rdx) ; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: lshr_32bytes_dwordOff: ; FALLBACK11: # %bb.0: ; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK11-NEXT: movzbl (%rsi), %eax ; FALLBACK11-NEXT: movl %eax, %ecx ; FALLBACK11-NEXT: shlb $5, %cl ; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: andb $6, %al ; FALLBACK11-NEXT: movzbl %al, %eax ; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK11-NEXT: movq %rdi, %r8 ; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax ; FALLBACK11-NEXT: movq %rax, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK11-NEXT: shrxq %rcx, %rsi, %rax ; FALLBACK11-NEXT: movq %r10, 8(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) ; FALLBACK11-NEXT: movq %rax, 24(%rdx) ; FALLBACK11-NEXT: movq %r9, (%rdx) ; FALLBACK11-NEXT: vzeroupper ; FALLBACK11-NEXT: retq ; ; 
FALLBACK12-LABEL: lshr_32bytes_dwordOff: ; FALLBACK12: # %bb.0: ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK12-NEXT: movzbl (%rsi), %ecx ; FALLBACK12-NEXT: movl %ecx, %eax ; FALLBACK12-NEXT: shlb $5, %al ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: andb $6, %cl ; FALLBACK12-NEXT: movzbl %cl, %r9d ; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10 ; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi ; FALLBACK12-NEXT: orq %r10, %rdi ; FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10 ; FALLBACK12-NEXT: movq %r10, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r11 ; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9 ; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r11, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r8 ; FALLBACK12-NEXT: addq %r10, %r10 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: orq %r8, %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movq %r9, 24(%rdx) ; FALLBACK12-NEXT: movq %r10, 8(%rdx) ; FALLBACK12-NEXT: movq %rbx, 16(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: popq %rbx ; FALLBACK12-NEXT: vzeroupper ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: lshr_32bytes_dwordOff: ; FALLBACK13: # %bb.0: ; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK13-NEXT: movzbl (%rsi), %eax ; FALLBACK13-NEXT: movl %eax, %ecx ; FALLBACK13-NEXT: shlb $5, %cl ; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; 
FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: andb $6, %al ; FALLBACK13-NEXT: movzbl %al, %eax ; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK13-NEXT: movq %rdi, %r8 ; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax ; FALLBACK13-NEXT: movq %rax, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK13-NEXT: shrq %cl, %rsi ; FALLBACK13-NEXT: movq %r10, 8(%rdx) ; FALLBACK13-NEXT: movq %r8, 16(%rdx) ; FALLBACK13-NEXT: movq %rsi, 24(%rdx) ; FALLBACK13-NEXT: movq %r9, (%rdx) ; FALLBACK13-NEXT: vzeroupper ; FALLBACK13-NEXT: retq ; ; FALLBACK14-LABEL: lshr_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK14-NEXT: movzbl (%rsi), %ecx ; FALLBACK14-NEXT: movl %ecx, %eax ; FALLBACK14-NEXT: shlb $5, %al ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: andb $6, %cl ; FALLBACK14-NEXT: movzbl %cl, %ecx ; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi ; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi ; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 ; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx ; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK14-NEXT: shrxq %rax, %rcx, %r11 ; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK14-NEXT: notb %al ; FALLBACK14-NEXT: addq %rdi, %rdi ; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK14-NEXT: orq %rsi, %rdi ; FALLBACK14-NEXT: addq %rcx, %rcx ; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK14-NEXT: orq %r9, %rcx ; FALLBACK14-NEXT: addq %r8, %r8 ; FALLBACK14-NEXT: shlxq %rax, %r8, %rax ; FALLBACK14-NEXT: orq %r10, %rax ; FALLBACK14-NEXT: movq %r11, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) ; 
FALLBACK14-NEXT: movq %rcx, 16(%rdx) ; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: lshr_32bytes_dwordOff: ; FALLBACK15: # %bb.0: ; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK15-NEXT: movzbl (%rsi), %eax ; FALLBACK15-NEXT: movl %eax, %ecx ; FALLBACK15-NEXT: shlb $5, %cl ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: andb $6, %al ; FALLBACK15-NEXT: movzbl %al, %eax ; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK15-NEXT: movq %rdi, %r8 ; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax ; FALLBACK15-NEXT: movq %rax, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK15-NEXT: shrxq %rcx, %rsi, %rax ; FALLBACK15-NEXT: movq %r10, 8(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) ; FALLBACK15-NEXT: movq %rax, 24(%rdx) ; FALLBACK15-NEXT: movq %r9, (%rdx) ; FALLBACK15-NEXT: vzeroupper ; FALLBACK15-NEXT: retq ; ; X86-SSE2-LABEL: lshr_32bytes_dwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $92, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 4(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 8(%eax), %esi ; X86-SSE2-NEXT: movl 12(%eax), %edi ; X86-SSE2-NEXT: movl 16(%eax), %ebx ; X86-SSE2-NEXT: movl 20(%eax), %ebp ; X86-SSE2-NEXT: movl 24(%eax), %edx ; X86-SSE2-NEXT: movl 28(%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movzbl (%eax), %eax ; X86-SSE2-NEXT: xorps %xmm0, %xmm0 ; 
X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andl $7, %eax ; X86-SSE2-NEXT: movl 16(%esp,%eax,4), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 20(%esp,%eax,4), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 28(%esp,%eax,4), %esi ; X86-SSE2-NEXT: movl 24(%esp,%eax,4), %edi ; X86-SSE2-NEXT: movl 36(%esp,%eax,4), %ebx ; X86-SSE2-NEXT: movl 32(%esp,%eax,4), %ebp ; X86-SSE2-NEXT: movl 44(%esp,%eax,4), %edx ; X86-SSE2-NEXT: movl 40(%esp,%eax,4), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl %edx, 28(%eax) ; X86-SSE2-NEXT: movl %ebp, 16(%eax) ; X86-SSE2-NEXT: movl %ebx, 20(%eax) ; X86-SSE2-NEXT: movl %edi, 8(%eax) ; X86-SSE2-NEXT: movl %esi, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: addl $92, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: lshr_32bytes_dwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: subl $76, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; 
X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1 ; X86-SSE42-NEXT: movzbl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm2, %xmm2 ; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm0, (%esp) ; X86-SSE42-NEXT: andl $7, %ecx ; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0 ; X86-SSE42-NEXT: movups 16(%esp,%ecx,4), %xmm1 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $76, %esp ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: lshr_32bytes_dwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: subl $76, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovups (%edx), %ymm0 ; X86-AVX-NEXT: movzbl (%ecx), %ecx ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovups %ymm0, (%esp) ; X86-AVX-NEXT: andl $7, %ecx ; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0 ; X86-AVX-NEXT: vmovups 16(%esp,%ecx,4), %xmm1 ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: addl $76, %esp ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 %bitOff = shl i256 %dwordOff, 5 %res = lshr i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } define void @lshr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { ; X64-SSE2-LABEL: lshr_32bytes_qwordOff: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movq (%rdi), %rax ; X64-SSE2-NEXT: movq 8(%rdi), %rcx ; X64-SSE2-NEXT: movq 16(%rdi), %r8 ; X64-SSE2-NEXT: movq 24(%rdi), %rdi ; X64-SSE2-NEXT: movzbl (%rsi), %esi ; X64-SSE2-NEXT: xorps %xmm0, %xmm0 ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq 
%rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: andl $3, %esi ; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %rax ; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx ; X64-SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdi ; X64-SSE2-NEXT: movq -56(%rsp,%rsi,8), %rsi ; X64-SSE2-NEXT: movq %rsi, 16(%rdx) ; X64-SSE2-NEXT: movq %rdi, 24(%rdx) ; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: lshr_32bytes_qwordOff: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: movups (%rdi), %xmm0 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1 ; X64-SSE42-NEXT: movzbl (%rsi), %eax ; X64-SSE42-NEXT: xorps %xmm2, %xmm2 ; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: andl $3, %eax ; X64-SSE42-NEXT: movups -72(%rsp,%rax,8), %xmm0 ; X64-SSE42-NEXT: movups -56(%rsp,%rax,8), %xmm1 ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: lshr_32bytes_qwordOff: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX-NEXT: movzbl (%rsi), %eax ; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $3, %eax ; X64-AVX-NEXT: vmovups -72(%rsp,%rax,8), %xmm0 ; X64-AVX-NEXT: vmovups -56(%rsp,%rax,8), %xmm1 ; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX-NEXT: vmovups %xmm0, (%rdx) ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; ; X86-SSE2-LABEL: lshr_32bytes_qwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $92, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl 
(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 4(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 8(%eax), %esi ; X86-SSE2-NEXT: movl 12(%eax), %edi ; X86-SSE2-NEXT: movl 16(%eax), %ebx ; X86-SSE2-NEXT: movl 20(%eax), %ebp ; X86-SSE2-NEXT: movl 24(%eax), %edx ; X86-SSE2-NEXT: movl 28(%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movzbl (%eax), %eax ; X86-SSE2-NEXT: xorps %xmm0, %xmm0 ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andl $3, %eax ; X86-SSE2-NEXT: movl 16(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 20(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 28(%esp,%eax,8), %esi ; X86-SSE2-NEXT: movl 24(%esp,%eax,8), %edi ; X86-SSE2-NEXT: movl 36(%esp,%eax,8), %ebx ; X86-SSE2-NEXT: movl 32(%esp,%eax,8), %ebp ; X86-SSE2-NEXT: movl 44(%esp,%eax,8), %edx ; X86-SSE2-NEXT: movl 40(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl %edx, 28(%eax) ; X86-SSE2-NEXT: movl %ebp, 16(%eax) ; X86-SSE2-NEXT: movl %ebx, 20(%eax) ; X86-SSE2-NEXT: movl %edi, 8(%eax) ; X86-SSE2-NEXT: movl %esi, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 
(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: addl $92, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: lshr_32bytes_qwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: subl $76, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1 ; X86-SSE42-NEXT: movzbl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm2, %xmm2 ; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm0, (%esp) ; X86-SSE42-NEXT: andl $3, %ecx ; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0 ; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $76, %esp ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: lshr_32bytes_qwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: subl $76, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovups (%edx), %ymm0 ; X86-AVX-NEXT: movzbl (%ecx), %ecx ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovups %ymm0, (%esp) ; X86-AVX-NEXT: andl $3, %ecx ; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0 ; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1 ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: addl $76, %esp ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %qwordOff = load i256, ptr %qwordOff.ptr, align 1 %bitOff = shl i256 %qwordOff, 6 %res = lshr i256 %src, %bitOff store i256 %res, ptr 
%dst, align 1 ret void } define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: shl_32bytes: ; FALLBACK0: # %bb.0: ; FALLBACK0-NEXT: pushq %rbx ; FALLBACK0-NEXT: movq (%rdi), %rcx ; FALLBACK0-NEXT: movq 8(%rdi), %r8 ; FALLBACK0-NEXT: movq 16(%rdi), %r9 ; FALLBACK0-NEXT: movq 24(%rdi), %rdi ; FALLBACK0-NEXT: movzbl (%rsi), %esi ; FALLBACK0-NEXT: leal (,%rsi,8), %eax ; FALLBACK0-NEXT: xorps %xmm0, %xmm0 ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: andb $24, %sil ; FALLBACK0-NEXT: negb %sil ; FALLBACK0-NEXT: movsbq %sil, %r10 ; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8 ; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi ; FALLBACK0-NEXT: movq %rdi, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq %r8, %r9 ; FALLBACK0-NEXT: shrq %r9 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %r9 ; FALLBACK0-NEXT: orq %r11, %r9 ; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r11 ; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10 ; FALLBACK0-NEXT: movq %r10, %rbx ; FALLBACK0-NEXT: shrq %rbx ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx ; FALLBACK0-NEXT: orq %r11, %rbx ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: shrq %rdi ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %rdi ; FALLBACK0-NEXT: orq %r10, %rdi ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 ; FALLBACK0-NEXT: movq %r8, (%rdx) ; FALLBACK0-NEXT: movq %rdi, 16(%rdx) ; FALLBACK0-NEXT: movq %rbx, 24(%rdx) ; FALLBACK0-NEXT: movq %r9, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx ; 
FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: shl_32bytes: ; FALLBACK1: # %bb.0: ; FALLBACK1-NEXT: movq (%rdi), %rax ; FALLBACK1-NEXT: movq 8(%rdi), %r8 ; FALLBACK1-NEXT: movq 16(%rdi), %r9 ; FALLBACK1-NEXT: movq 24(%rdi), %rdi ; FALLBACK1-NEXT: movzbl (%rsi), %esi ; FALLBACK1-NEXT: leal (,%rsi,8), %ecx ; FALLBACK1-NEXT: xorps %xmm0, %xmm0 ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: andb $24, %sil ; FALLBACK1-NEXT: negb %sil ; FALLBACK1-NEXT: movsbq %sil, %rax ; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK1-NEXT: shldq %cl, %rax, %rsi ; FALLBACK1-NEXT: shldq %cl, %r8, %rax ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK1-NEXT: shlq %cl, %r8 ; FALLBACK1-NEXT: movq %rsi, 16(%rdx) ; FALLBACK1-NEXT: movq %rdi, 24(%rdx) ; FALLBACK1-NEXT: movq %r8, (%rdx) ; FALLBACK1-NEXT: movq %rax, 8(%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: shl_32bytes: ; FALLBACK2: # %bb.0: ; FALLBACK2-NEXT: movq (%rdi), %rcx ; FALLBACK2-NEXT: movq 8(%rdi), %r8 ; FALLBACK2-NEXT: movq 16(%rdi), %r9 ; FALLBACK2-NEXT: movq 24(%rdi), %rdi ; FALLBACK2-NEXT: movzbl (%rsi), %esi ; FALLBACK2-NEXT: leal (,%rsi,8), %eax ; FALLBACK2-NEXT: xorps %xmm0, %xmm0 ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil ; FALLBACK2-NEXT: movsbq %sil, %rsi ; FALLBACK2-NEXT: movq 
-40(%rsp,%rsi), %rdi ; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx ; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 ; FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi ; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 ; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK2-NEXT: orq %r8, %rdi ; FALLBACK2-NEXT: shrq %rsi ; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi ; FALLBACK2-NEXT: orq %r9, %rsi ; FALLBACK2-NEXT: shrq %rcx ; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax ; FALLBACK2-NEXT: orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) ; FALLBACK2-NEXT: movq %rsi, 24(%rdx) ; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes: ; FALLBACK3: # %bb.0: ; FALLBACK3-NEXT: movq (%rdi), %rax ; FALLBACK3-NEXT: movq 8(%rdi), %r8 ; FALLBACK3-NEXT: movq 16(%rdi), %r9 ; FALLBACK3-NEXT: movq 24(%rdi), %rdi ; FALLBACK3-NEXT: movzbl (%rsi), %esi ; FALLBACK3-NEXT: leal (,%rsi,8), %ecx ; FALLBACK3-NEXT: xorps %xmm0, %xmm0 ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: andb $24, %sil ; FALLBACK3-NEXT: negb %sil ; FALLBACK3-NEXT: movsbq %sil, %rax ; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK3-NEXT: shldq %cl, %rax, %rsi ; FALLBACK3-NEXT: shldq %cl, %r8, %rax ; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx ; FALLBACK3-NEXT: movq %rsi, 16(%rdx) ; FALLBACK3-NEXT: movq %rdi, 24(%rdx) ; FALLBACK3-NEXT: movq %rcx, (%rdx) ; 
FALLBACK3-NEXT: movq %rax, 8(%rdx) ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: shl_32bytes: ; FALLBACK4: # %bb.0: ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK4-NEXT: movzbl (%rsi), %ecx ; FALLBACK4-NEXT: leal (,%rcx,8), %eax ; FALLBACK4-NEXT: xorps %xmm2, %xmm2 ; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: andb $24, %cl ; FALLBACK4-NEXT: negb %cl ; FALLBACK4-NEXT: movsbq %cl, %r8 ; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r9 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10 ; FALLBACK4-NEXT: movq %r10, %rdi ; FALLBACK4-NEXT: shrq %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %rdi ; FALLBACK4-NEXT: orq %r9, %rdi ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9 ; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8 ; FALLBACK4-NEXT: movq %r8, %r11 ; FALLBACK4-NEXT: shrq %r11 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r11 ; FALLBACK4-NEXT: orq %r10, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r8 ; FALLBACK4-NEXT: movq %r9, %r10 ; FALLBACK4-NEXT: shrq %r10 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 ; FALLBACK4-NEXT: orq %r8, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r9 ; FALLBACK4-NEXT: movq %r9, (%rdx) ; FALLBACK4-NEXT: movq %r10, 8(%rdx) ; FALLBACK4-NEXT: movq %r11, 16(%rdx) ; FALLBACK4-NEXT: movq %rdi, 24(%rdx) ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: shl_32bytes: ; FALLBACK5: # %bb.0: ; FALLBACK5-NEXT: movups (%rdi), %xmm0 ; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK5-NEXT: movzbl (%rsi), %eax ; FALLBACK5-NEXT: leal (,%rax,8), %ecx ; FALLBACK5-NEXT: xorps %xmm2, %xmm2 
; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: andb $24, %al ; FALLBACK5-NEXT: negb %al ; FALLBACK5-NEXT: movsbq %al, %rax ; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK5-NEXT: shldq %cl, %rax, %rsi ; FALLBACK5-NEXT: movq %r8, %r9 ; FALLBACK5-NEXT: shlq %cl, %r9 ; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK5-NEXT: shldq %cl, %r8, %rax ; FALLBACK5-NEXT: movq %rax, 8(%rdx) ; FALLBACK5-NEXT: movq %rsi, 16(%rdx) ; FALLBACK5-NEXT: movq %rdi, 24(%rdx) ; FALLBACK5-NEXT: movq %r9, (%rdx) ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: shl_32bytes: ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movzbl (%rsi), %ecx ; FALLBACK6-NEXT: leal (,%rcx,8), %eax ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: andb $24, %cl ; FALLBACK6-NEXT: negb %cl ; FALLBACK6-NEXT: movsbq %cl, %rcx ; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi ; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi ; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 ; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 ; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx ; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 ; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 ; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK6-NEXT: notb %al ; FALLBACK6-NEXT: shrq %rdi ; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK6-NEXT: orq %rsi, %rdi ; FALLBACK6-NEXT: shrq %rcx ; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx ; 
FALLBACK6-NEXT: orq %r8, %rcx ; FALLBACK6-NEXT: shrq %r9 ; FALLBACK6-NEXT: shrxq %rax, %r9, %rax ; FALLBACK6-NEXT: orq %r10, %rax ; FALLBACK6-NEXT: movq %r11, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) ; FALLBACK6-NEXT: movq %rcx, 16(%rdx) ; FALLBACK6-NEXT: movq %rdi, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes: ; FALLBACK7: # %bb.0: ; FALLBACK7-NEXT: movups (%rdi), %xmm0 ; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK7-NEXT: movzbl (%rsi), %eax ; FALLBACK7-NEXT: leal (,%rax,8), %ecx ; FALLBACK7-NEXT: xorps %xmm2, %xmm2 ; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: andb $24, %al ; FALLBACK7-NEXT: negb %al ; FALLBACK7-NEXT: movsbq %al, %rax ; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK7-NEXT: shldq %cl, %rax, %rsi ; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9 ; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK7-NEXT: shldq %cl, %r8, %rax ; FALLBACK7-NEXT: movq %rax, 8(%rdx) ; FALLBACK7-NEXT: movq %rsi, 16(%rdx) ; FALLBACK7-NEXT: movq %rdi, 24(%rdx) ; FALLBACK7-NEXT: movq %r9, (%rdx) ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: shl_32bytes: ; FALLBACK8: # %bb.0: ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: movzbl (%rsi), %ecx ; FALLBACK8-NEXT: leal (,%rcx,8), %eax ; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: andb $24, %cl ; FALLBACK8-NEXT: negb %cl ; FALLBACK8-NEXT: movsbq %cl, %r8 ; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r9 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; 
FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10 ; FALLBACK8-NEXT: movq %r10, %rdi ; FALLBACK8-NEXT: shrq %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %rdi ; FALLBACK8-NEXT: orq %r9, %rdi ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9 ; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8 ; FALLBACK8-NEXT: movq %r8, %r11 ; FALLBACK8-NEXT: shrq %r11 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r11 ; FALLBACK8-NEXT: orq %r10, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r8 ; FALLBACK8-NEXT: movq %r9, %r10 ; FALLBACK8-NEXT: shrq %r10 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 ; FALLBACK8-NEXT: orq %r8, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r9 ; FALLBACK8-NEXT: movq %r9, (%rdx) ; FALLBACK8-NEXT: movq %r10, 8(%rdx) ; FALLBACK8-NEXT: movq %r11, 16(%rdx) ; FALLBACK8-NEXT: movq %rdi, 24(%rdx) ; FALLBACK8-NEXT: vzeroupper ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: shl_32bytes: ; FALLBACK9: # %bb.0: ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: movzbl (%rsi), %eax ; FALLBACK9-NEXT: leal (,%rax,8), %ecx ; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: andb $24, %al ; FALLBACK9-NEXT: negb %al ; FALLBACK9-NEXT: movsbq %al, %rax ; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK9-NEXT: shldq %cl, %rax, %rsi ; FALLBACK9-NEXT: movq %r8, %r9 ; FALLBACK9-NEXT: shlq %cl, %r9 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shldq %cl, %r8, %rax ; FALLBACK9-NEXT: movq %rax, 8(%rdx) ; FALLBACK9-NEXT: movq %rsi, 16(%rdx) ; FALLBACK9-NEXT: movq %rdi, 24(%rdx) ; FALLBACK9-NEXT: movq %r9, (%rdx) ; 
FALLBACK9-NEXT: vzeroupper ; FALLBACK9-NEXT: retq ; ; FALLBACK10-LABEL: shl_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: movzbl (%rsi), %ecx ; FALLBACK10-NEXT: leal (,%rcx,8), %eax ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: andb $24, %cl ; FALLBACK10-NEXT: negb %cl ; FALLBACK10-NEXT: movsbq %cl, %rcx ; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi ; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi ; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 ; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 ; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx ; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 ; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 ; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK10-NEXT: notb %al ; FALLBACK10-NEXT: shrq %rdi ; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK10-NEXT: orq %rsi, %rdi ; FALLBACK10-NEXT: shrq %rcx ; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx ; FALLBACK10-NEXT: orq %r8, %rcx ; FALLBACK10-NEXT: shrq %r9 ; FALLBACK10-NEXT: shrxq %rax, %r9, %rax ; FALLBACK10-NEXT: orq %r10, %rax ; FALLBACK10-NEXT: movq %r11, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) ; FALLBACK10-NEXT: movq %rcx, 16(%rdx) ; FALLBACK10-NEXT: movq %rdi, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: shl_32bytes: ; FALLBACK11: # %bb.0: ; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK11-NEXT: movzbl (%rsi), %eax ; FALLBACK11-NEXT: leal (,%rax,8), %ecx ; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: andb $24, %al ; FALLBACK11-NEXT: negb %al ; FALLBACK11-NEXT: movsbq %al, %rax ; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8 
; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK11-NEXT: shldq %cl, %rax, %rsi ; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9 ; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK11-NEXT: shldq %cl, %r8, %rax ; FALLBACK11-NEXT: movq %rax, 8(%rdx) ; FALLBACK11-NEXT: movq %rsi, 16(%rdx) ; FALLBACK11-NEXT: movq %rdi, 24(%rdx) ; FALLBACK11-NEXT: movq %r9, (%rdx) ; FALLBACK11-NEXT: vzeroupper ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: shl_32bytes: ; FALLBACK12: # %bb.0: ; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK12-NEXT: movzbl (%rsi), %ecx ; FALLBACK12-NEXT: leal (,%rcx,8), %eax ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: andb $24, %cl ; FALLBACK12-NEXT: negb %cl ; FALLBACK12-NEXT: movsbq %cl, %r8 ; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r9 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10 ; FALLBACK12-NEXT: movq %r10, %rdi ; FALLBACK12-NEXT: shrq %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %rdi ; FALLBACK12-NEXT: orq %r9, %rdi ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9 ; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8 ; FALLBACK12-NEXT: movq %r8, %r11 ; FALLBACK12-NEXT: shrq %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r11 ; FALLBACK12-NEXT: orq %r10, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r8 ; FALLBACK12-NEXT: movq %r9, %r10 ; FALLBACK12-NEXT: shrq %r10 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 ; FALLBACK12-NEXT: orq %r8, %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r9 ; FALLBACK12-NEXT: movq %r9, (%rdx) ; FALLBACK12-NEXT: movq %r10, 8(%rdx) ; FALLBACK12-NEXT: movq %r11, 16(%rdx) ; FALLBACK12-NEXT: movq 
%rdi, 24(%rdx) ; FALLBACK12-NEXT: vzeroupper ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: shl_32bytes: ; FALLBACK13: # %bb.0: ; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK13-NEXT: movzbl (%rsi), %eax ; FALLBACK13-NEXT: leal (,%rax,8), %ecx ; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: andb $24, %al ; FALLBACK13-NEXT: negb %al ; FALLBACK13-NEXT: movsbq %al, %rax ; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK13-NEXT: shldq %cl, %rax, %rsi ; FALLBACK13-NEXT: movq %r8, %r9 ; FALLBACK13-NEXT: shlq %cl, %r9 ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK13-NEXT: shldq %cl, %r8, %rax ; FALLBACK13-NEXT: movq %rax, 8(%rdx) ; FALLBACK13-NEXT: movq %rsi, 16(%rdx) ; FALLBACK13-NEXT: movq %rdi, 24(%rdx) ; FALLBACK13-NEXT: movq %r9, (%rdx) ; FALLBACK13-NEXT: vzeroupper ; FALLBACK13-NEXT: retq ; ; FALLBACK14-LABEL: shl_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK14-NEXT: movzbl (%rsi), %ecx ; FALLBACK14-NEXT: leal (,%rcx,8), %eax ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: andb $24, %cl ; FALLBACK14-NEXT: negb %cl ; FALLBACK14-NEXT: movsbq %cl, %rcx ; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi ; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi ; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 ; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 ; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx ; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 ; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 ; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK14-NEXT: notb %al ; FALLBACK14-NEXT: shrq %rdi ; 
FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK14-NEXT: orq %rsi, %rdi ; FALLBACK14-NEXT: shrq %rcx ; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx ; FALLBACK14-NEXT: orq %r8, %rcx ; FALLBACK14-NEXT: shrq %r9 ; FALLBACK14-NEXT: shrxq %rax, %r9, %rax ; FALLBACK14-NEXT: orq %r10, %rax ; FALLBACK14-NEXT: movq %r11, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) ; FALLBACK14-NEXT: movq %rcx, 16(%rdx) ; FALLBACK14-NEXT: movq %rdi, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: shl_32bytes: ; FALLBACK15: # %bb.0: ; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK15-NEXT: movzbl (%rsi), %eax ; FALLBACK15-NEXT: leal (,%rax,8), %ecx ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: andb $24, %al ; FALLBACK15-NEXT: negb %al ; FALLBACK15-NEXT: movsbq %al, %rax ; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK15-NEXT: shldq %cl, %rax, %rsi ; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK15-NEXT: shldq %cl, %r8, %rax ; FALLBACK15-NEXT: movq %rax, 8(%rdx) ; FALLBACK15-NEXT: movq %rsi, 16(%rdx) ; FALLBACK15-NEXT: movq %rdi, 24(%rdx) ; FALLBACK15-NEXT: movq %r9, (%rdx) ; FALLBACK15-NEXT: vzeroupper ; FALLBACK15-NEXT: retq ; ; FALLBACK16-LABEL: shl_32bytes: ; FALLBACK16: # %bb.0: ; FALLBACK16-NEXT: pushl %ebp ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi ; FALLBACK16-NEXT: subl $108, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK16-NEXT: movl (%ecx), %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 4(%ecx), %edx ; FALLBACK16-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 8(%ecx), %esi ; FALLBACK16-NEXT: movl 12(%ecx), %edi ; FALLBACK16-NEXT: movl 16(%ecx), %ebx ; FALLBACK16-NEXT: movb (%eax), %ah ; FALLBACK16-NEXT: movl 20(%ecx), %ebp ; FALLBACK16-NEXT: movl 24(%ecx), %edx ; FALLBACK16-NEXT: movl 28(%ecx), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movb %ah, %ch ; FALLBACK16-NEXT: shlb $3, %ch ; FALLBACK16-NEXT: xorps %xmm0, %xmm0 ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: andb $28, %ah ; FALLBACK16-NEXT: negb %ah ; FALLBACK16-NEXT: movsbl %ah, %ebx ; FALLBACK16-NEXT: movl 64(%esp,%ebx), %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 68(%esp,%ebx), %eax ; FALLBACK16-NEXT: movl %eax, %esi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: movb %ch, %dl ; FALLBACK16-NEXT: notb %dl ; FALLBACK16-NEXT: shrl %edi ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: orl %esi, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 76(%esp,%ebx), %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: movl 72(%esp,%ebx), %esi ; FALLBACK16-NEXT: movl %esi, %ebp ; FALLBACK16-NEXT: shrl %ebp ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, 
%ebp ; FALLBACK16-NEXT: orl %edi, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: shrl %eax ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: orl %esi, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 84(%esp,%ebx), %esi ; FALLBACK16-NEXT: movl %esi, %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: movl 80(%esp,%ebx), %edi ; FALLBACK16-NEXT: movl %edi, %ebp ; FALLBACK16-NEXT: shrl %ebp ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %ebp ; FALLBACK16-NEXT: orl %eax, %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: shrl %eax ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: orl %edi, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 92(%esp,%ebx), %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: movl 88(%esp,%ebx), %edi ; FALLBACK16-NEXT: movl %edi, %ebx ; FALLBACK16-NEXT: shrl %ebx ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: orl %eax, %ebx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: shrl %esi ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: orl %edi, %esi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl %edx, (%eax) ; FALLBACK16-NEXT: movl %esi, 24(%eax) ; FALLBACK16-NEXT: movl %ebx, 28(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 16(%eax) ; 
FALLBACK16-NEXT: movl %ebp, 20(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 8(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 12(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 4(%eax) ; FALLBACK16-NEXT: addl $108, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx ; FALLBACK16-NEXT: popl %ebp ; FALLBACK16-NEXT: retl ; ; FALLBACK17-LABEL: shl_32bytes: ; FALLBACK17: # %bb.0: ; FALLBACK17-NEXT: pushl %ebp ; FALLBACK17-NEXT: pushl %ebx ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi ; FALLBACK17-NEXT: subl $92, %esp ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl (%eax), %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 4(%eax), %edx ; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: movl 8(%eax), %esi ; FALLBACK17-NEXT: movl 12(%eax), %edi ; FALLBACK17-NEXT: movl 16(%eax), %ebx ; FALLBACK17-NEXT: movb (%ecx), %ch ; FALLBACK17-NEXT: movl 20(%eax), %ebp ; FALLBACK17-NEXT: movl 24(%eax), %edx ; FALLBACK17-NEXT: movl 28(%eax), %eax ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movb %ch, %cl ; FALLBACK17-NEXT: shlb $3, %cl ; FALLBACK17-NEXT: xorps %xmm0, %xmm0 ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 
{{[0-9]+}}(%esp) ; FALLBACK17-NEXT: andb $28, %ch ; FALLBACK17-NEXT: negb %ch ; FALLBACK17-NEXT: movsbl %ch, %eax ; FALLBACK17-NEXT: movl 56(%esp,%eax), %edx ; FALLBACK17-NEXT: movl 60(%esp,%eax), %ebx ; FALLBACK17-NEXT: movl %ebx, %esi ; FALLBACK17-NEXT: shldl %cl, %edx, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 52(%esp,%eax), %esi ; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %esi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 64(%esp,%eax), %edi ; FALLBACK17-NEXT: movl 68(%esp,%eax), %ebp ; FALLBACK17-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %edi, %ebp ; FALLBACK17-NEXT: shldl %cl, %ebx, %edi ; FALLBACK17-NEXT: movl 48(%esp,%eax), %ebx ; FALLBACK17-NEXT: movl 72(%esp,%eax), %edx ; FALLBACK17-NEXT: movl 76(%esp,%eax), %esi ; FALLBACK17-NEXT: shldl %cl, %edx, %esi ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: shldl %cl, %eax, %edx ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl %edx, 24(%eax) ; FALLBACK17-NEXT: movl %esi, 28(%eax) ; FALLBACK17-NEXT: movl %edi, 16(%eax) ; FALLBACK17-NEXT: movl %ebp, 20(%eax) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, 8(%eax) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, 12(%eax) ; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload ; FALLBACK17-NEXT: shldl %cl, %ebx, %edx ; FALLBACK17-NEXT: shll %cl, %ebx ; FALLBACK17-NEXT: movl %ebx, (%eax) ; FALLBACK17-NEXT: movl %edx, 4(%eax) ; FALLBACK17-NEXT: addl $92, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi ; FALLBACK17-NEXT: popl %ebx ; FALLBACK17-NEXT: popl %ebp ; FALLBACK17-NEXT: retl ; ; FALLBACK18-LABEL: shl_32bytes: ; FALLBACK18: # %bb.0: ; FALLBACK18-NEXT: pushl %ebp ; FALLBACK18-NEXT: pushl %ebx ; 
FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $108, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl (%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 4(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 8(%eax), %esi ; FALLBACK18-NEXT: movl 12(%eax), %edi ; FALLBACK18-NEXT: movl 16(%eax), %ebp ; FALLBACK18-NEXT: movzbl (%ebx), %ebx ; FALLBACK18-NEXT: movl 20(%eax), %edx ; FALLBACK18-NEXT: movl 24(%eax), %ecx ; FALLBACK18-NEXT: movl 28(%eax), %eax ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, %edx ; FALLBACK18-NEXT: shlb $3, %dl ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: andb $28, %bl ; FALLBACK18-NEXT: negb %bl ; FALLBACK18-NEXT: movsbl %bl, %esi ; FALLBACK18-NEXT: movl 64(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 68(%esp,%esi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shlxl %edx, %eax, %edi ; FALLBACK18-NEXT: movl %edx, %ecx ; FALLBACK18-NEXT: notb %cl ; FALLBACK18-NEXT: shrl %ebx ; FALLBACK18-NEXT: shrxl %ecx, %ebx, %ebx ; FALLBACK18-NEXT: orl %edi, %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill ; FALLBACK18-NEXT: movl 72(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, %edi ; FALLBACK18-NEXT: shrl %edi ; FALLBACK18-NEXT: shrxl %ecx, %edi, %eax ; FALLBACK18-NEXT: movl 76(%esp,%esi), %edi ; FALLBACK18-NEXT: shlxl %edx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebx ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: shrl %eax ; FALLBACK18-NEXT: shrxl %ecx, %eax, %eax ; FALLBACK18-NEXT: orl %ebx, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 80(%esp,%esi), %ebx ; FALLBACK18-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ebx ; FALLBACK18-NEXT: shrxl %ecx, %ebx, %eax ; FALLBACK18-NEXT: movl 84(%esp,%esi), %ebx ; FALLBACK18-NEXT: shlxl %edx, %ebx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %edi ; FALLBACK18-NEXT: shrxl %ecx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shlxl %edx, 92(%esp,%esi), %ebp ; FALLBACK18-NEXT: movl 88(%esp,%esi), %esi ; FALLBACK18-NEXT: shlxl %edx, %esi, %eax ; FALLBACK18-NEXT: shrl %esi ; FALLBACK18-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK18-NEXT: orl %ebp, %esi ; FALLBACK18-NEXT: shrl %ebx ; FALLBACK18-NEXT: shrxl %ecx, %ebx, %edx ; FALLBACK18-NEXT: orl %eax, %edx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, (%eax) ; FALLBACK18-NEXT: movl %edx, 24(%eax) ; FALLBACK18-NEXT: movl %esi, 28(%eax) ; FALLBACK18-NEXT: movl %edi, 16(%eax) ; FALLBACK18-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 20(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 8(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 12(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 4(%eax) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi ; FALLBACK18-NEXT: popl %ebx ; FALLBACK18-NEXT: popl %ebp ; FALLBACK18-NEXT: retl ; ; FALLBACK19-LABEL: shl_32bytes: ; FALLBACK19: # %bb.0: ; FALLBACK19-NEXT: pushl %ebp ; FALLBACK19-NEXT: pushl %ebx ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $92, %esp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebx ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK19-NEXT: movl (%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 4(%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 8(%ecx), %esi ; FALLBACK19-NEXT: movl 12(%ecx), %edi ; FALLBACK19-NEXT: movl 16(%ecx), %ebp ; FALLBACK19-NEXT: movzbl (%ebx), %ebx ; FALLBACK19-NEXT: movl 20(%ecx), %edx ; FALLBACK19-NEXT: movl 24(%ecx), %eax ; FALLBACK19-NEXT: movl 28(%ecx), %ecx ; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebx, %ecx ; FALLBACK19-NEXT: shlb $3, %cl ; FALLBACK19-NEXT: xorps %xmm0, %xmm0 ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; 
FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: andb $28, %bl ; FALLBACK19-NEXT: negb %bl ; FALLBACK19-NEXT: movsbl %bl, %eax ; FALLBACK19-NEXT: movl 56(%esp,%eax), %edx ; FALLBACK19-NEXT: movl 60(%esp,%eax), %esi ; FALLBACK19-NEXT: movl %esi, (%esp) # 4-byte Spill ; FALLBACK19-NEXT: shldl %cl, %edx, %esi ; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 52(%esp,%eax), %ebx ; FALLBACK19-NEXT: shldl %cl, %ebx, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 64(%esp,%eax), %edi ; FALLBACK19-NEXT: movl 68(%esp,%eax), %ebp ; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shldl %cl, %edi, %ebp ; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload ; FALLBACK19-NEXT: shldl %cl, %edx, %edi ; FALLBACK19-NEXT: movl 48(%esp,%eax), %edx ; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK19-NEXT: movl 72(%esp,%eax), %edx ; FALLBACK19-NEXT: movl 76(%esp,%eax), %esi ; FALLBACK19-NEXT: shldl %cl, %edx, %esi ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: shldl %cl, %eax, %edx ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: movl %edx, 24(%eax) ; FALLBACK19-NEXT: movl %esi, 28(%eax) ; FALLBACK19-NEXT: movl %edi, 16(%eax) ; FALLBACK19-NEXT: movl %ebp, 20(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, 8(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, 12(%eax) ; FALLBACK19-NEXT: movl (%esp), %esi # 4-byte Reload ; FALLBACK19-NEXT: shlxl %ecx, %esi, %edx ; FALLBACK19-NEXT: movl %edx, (%eax) ; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK19-NEXT: shldl %cl, %esi, %ebx ; FALLBACK19-NEXT: movl %ebx, 4(%eax) ; FALLBACK19-NEXT: addl $92, %esp ; FALLBACK19-NEXT: popl 
%esi ; FALLBACK19-NEXT: popl %edi ; FALLBACK19-NEXT: popl %ebx ; FALLBACK19-NEXT: popl %ebp ; FALLBACK19-NEXT: retl ; ; FALLBACK20-LABEL: shl_32bytes: ; FALLBACK20: # %bb.0: ; FALLBACK20-NEXT: pushl %ebp ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; FALLBACK20-NEXT: pushl %esi ; FALLBACK20-NEXT: subl $108, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 ; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK20-NEXT: movzbl (%eax), %ecx ; FALLBACK20-NEXT: movb %cl, %dh ; FALLBACK20-NEXT: shlb $3, %dh ; FALLBACK20-NEXT: xorps %xmm2, %xmm2 ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: andb $28, %cl ; FALLBACK20-NEXT: negb %cl ; FALLBACK20-NEXT: movsbl %cl, %ebx ; FALLBACK20-NEXT: movl 84(%esp,%ebx), %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: shll %cl, %edi ; FALLBACK20-NEXT: movb %dh, %dl ; FALLBACK20-NEXT: notb %dl ; FALLBACK20-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK20-NEXT: movl %esi, %eax ; FALLBACK20-NEXT: shrl %eax ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: orl %edi, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: movl %ebx, %edi ; FALLBACK20-NEXT: movl 76(%esp,%ebx), %ebp ; FALLBACK20-NEXT: movl %ebp, %eax ; FALLBACK20-NEXT: shrl %eax ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: orl %esi, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: shll %cl, %ebp ; FALLBACK20-NEXT: movl 72(%esp,%ebx), %ebx ; FALLBACK20-NEXT: movl %ebx, %eax ; 
FALLBACK20-NEXT: shrl %eax ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 68(%esp,%edi), %ebp ; FALLBACK20-NEXT: movl %ebp, %esi ; FALLBACK20-NEXT: shrl %esi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %esi ; FALLBACK20-NEXT: orl %ebx, %esi ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: shll %cl, %ebp ; FALLBACK20-NEXT: movl 64(%esp,%edi), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: shrl %ebx ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: orl %ebp, %ebx ; FALLBACK20-NEXT: movl 88(%esp,%edi), %ebp ; FALLBACK20-NEXT: movl %ebp, %edi ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: shll %cl, %edi ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: shrl %eax ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: orl %edi, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: movl 92(%esp,%eax), %edi ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: shll %cl, %edi ; FALLBACK20-NEXT: shrl %ebp ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: orl %edi, %ebp ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl %edx, (%eax) ; FALLBACK20-NEXT: movl %ebp, 28(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 24(%eax) ; FALLBACK20-NEXT: movl %ebx, 
4(%eax) ; FALLBACK20-NEXT: movl %esi, 8(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 12(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 16(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 20(%eax) ; FALLBACK20-NEXT: addl $108, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx ; FALLBACK20-NEXT: popl %ebp ; FALLBACK20-NEXT: retl ; ; FALLBACK21-LABEL: shl_32bytes: ; FALLBACK21: # %bb.0: ; FALLBACK21-NEXT: pushl %ebp ; FALLBACK21-NEXT: pushl %ebx ; FALLBACK21-NEXT: pushl %edi ; FALLBACK21-NEXT: pushl %esi ; FALLBACK21-NEXT: subl $92, %esp ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK21-NEXT: movups (%ecx), %xmm0 ; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK21-NEXT: movzbl (%eax), %eax ; FALLBACK21-NEXT: movl %eax, %ecx ; FALLBACK21-NEXT: shlb $3, %cl ; FALLBACK21-NEXT: xorps %xmm2, %xmm2 ; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: andb $28, %al ; FALLBACK21-NEXT: negb %al ; FALLBACK21-NEXT: movsbl %al, %ebp ; FALLBACK21-NEXT: movl 64(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl 68(%esp,%ebp), %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 60(%esp,%ebp), %edx ; FALLBACK21-NEXT: shldl %cl, %edx, %eax ; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edi ; FALLBACK21-NEXT: shldl %cl, %edi, %edx ; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK21-NEXT: movl 52(%esp,%ebp), %ebx ; 
FALLBACK21-NEXT: shldl %cl, %ebx, %edi ; FALLBACK21-NEXT: movl 72(%esp,%ebp), %edx ; FALLBACK21-NEXT: movl %edx, %eax ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK21-NEXT: shldl %cl, %esi, %eax ; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi ; FALLBACK21-NEXT: movl 76(%esp,%ebp), %ebp ; FALLBACK21-NEXT: shldl %cl, %edx, %ebp ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK21-NEXT: movl %ebp, 28(%edx) ; FALLBACK21-NEXT: movl %eax, 24(%edx) ; FALLBACK21-NEXT: movl %esi, %eax ; FALLBACK21-NEXT: shll %cl, %eax ; FALLBACK21-NEXT: shldl %cl, %esi, %ebx ; FALLBACK21-NEXT: movl %ebx, 4(%edx) ; FALLBACK21-NEXT: movl %edi, 8(%edx) ; FALLBACK21-NEXT: movl (%esp), %ecx # 4-byte Reload ; FALLBACK21-NEXT: movl %ecx, 12(%edx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK21-NEXT: movl %ecx, 16(%edx) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK21-NEXT: movl %ecx, 20(%edx) ; FALLBACK21-NEXT: movl %eax, (%edx) ; FALLBACK21-NEXT: addl $92, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi ; FALLBACK21-NEXT: popl %ebx ; FALLBACK21-NEXT: popl %ebp ; FALLBACK21-NEXT: retl ; ; FALLBACK22-LABEL: shl_32bytes: ; FALLBACK22: # %bb.0: ; FALLBACK22-NEXT: pushl %ebp ; FALLBACK22-NEXT: pushl %ebx ; FALLBACK22-NEXT: pushl %edi ; FALLBACK22-NEXT: pushl %esi ; FALLBACK22-NEXT: subl $108, %esp ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK22-NEXT: movzbl (%eax), %ecx ; FALLBACK22-NEXT: movl %ecx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: xorps %xmm2, %xmm2 ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: andb $28, %cl ; FALLBACK22-NEXT: negb %cl ; 
FALLBACK22-NEXT: movsbl %cl, %edx ; FALLBACK22-NEXT: movl 84(%esp,%edx), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx ; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi ; FALLBACK22-NEXT: shlxl %eax, %esi, %edi ; FALLBACK22-NEXT: movl %eax, %ebx ; FALLBACK22-NEXT: notb %bl ; FALLBACK22-NEXT: shrl %esi ; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: orl %ecx, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 76(%esp,%edx), %ecx ; FALLBACK22-NEXT: movl %ecx, %esi ; FALLBACK22-NEXT: shrl %esi ; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: orl %edi, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shlxl %eax, %ecx, %ecx ; FALLBACK22-NEXT: movl 72(%esp,%edx), %esi ; FALLBACK22-NEXT: movl %esi, %edi ; FALLBACK22-NEXT: shrl %edi ; FALLBACK22-NEXT: shrxl %ebx, %edi, %edi ; FALLBACK22-NEXT: orl %ecx, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shlxl %eax, %esi, %ecx ; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi ; FALLBACK22-NEXT: movl %esi, %edi ; FALLBACK22-NEXT: shrl %edi ; FALLBACK22-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK22-NEXT: orl %ecx, %ebp ; FALLBACK22-NEXT: shlxl %eax, %esi, %edi ; FALLBACK22-NEXT: movl 64(%esp,%edx), %esi ; FALLBACK22-NEXT: movl %esi, %ecx ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: orl %edi, %ecx ; FALLBACK22-NEXT: shlxl %eax, %esi, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shlxl %eax, 92(%esp,%edx), %edi ; FALLBACK22-NEXT: movl 88(%esp,%edx), %edx ; FALLBACK22-NEXT: shlxl %eax, %edx, %esi ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: shrl %eax ; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: shrl %edx ; FALLBACK22-NEXT: shrxl 
%ebx, %edx, %edx ; FALLBACK22-NEXT: orl %edi, %edx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK22-NEXT: movl %edi, (%esi) ; FALLBACK22-NEXT: movl %edx, 28(%esi) ; FALLBACK22-NEXT: movl %eax, 24(%esi) ; FALLBACK22-NEXT: movl %ecx, 4(%esi) ; FALLBACK22-NEXT: movl %ebp, 8(%esi) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 12(%esi) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 16(%esi) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 20(%esi) ; FALLBACK22-NEXT: addl $108, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx ; FALLBACK22-NEXT: popl %ebp ; FALLBACK22-NEXT: retl ; ; FALLBACK23-LABEL: shl_32bytes: ; FALLBACK23: # %bb.0: ; FALLBACK23-NEXT: pushl %ebp ; FALLBACK23-NEXT: pushl %ebx ; FALLBACK23-NEXT: pushl %edi ; FALLBACK23-NEXT: pushl %esi ; FALLBACK23-NEXT: subl $92, %esp ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK23-NEXT: movups (%ecx), %xmm0 ; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK23-NEXT: movzbl (%eax), %eax ; FALLBACK23-NEXT: movl %eax, %ecx ; FALLBACK23-NEXT: shlb $3, %cl ; FALLBACK23-NEXT: xorps %xmm2, %xmm2 ; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: andb $28, %al ; FALLBACK23-NEXT: negb %al ; FALLBACK23-NEXT: movsbl %al, %ebx ; FALLBACK23-NEXT: movl 64(%esp,%ebx), %eax ; FALLBACK23-NEXT: movl 68(%esp,%ebx), %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %eax, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 
60(%esp,%ebx), %edx ; FALLBACK23-NEXT: shldl %cl, %edx, %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 56(%esp,%ebx), %edi ; FALLBACK23-NEXT: shldl %cl, %edi, %edx ; FALLBACK23-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK23-NEXT: movl 52(%esp,%ebx), %ebp ; FALLBACK23-NEXT: shldl %cl, %ebp, %edi ; FALLBACK23-NEXT: movl 72(%esp,%ebx), %edx ; FALLBACK23-NEXT: movl %edx, %eax ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK23-NEXT: shldl %cl, %esi, %eax ; FALLBACK23-NEXT: movl 48(%esp,%ebx), %esi ; FALLBACK23-NEXT: movl 76(%esp,%ebx), %ebx ; FALLBACK23-NEXT: shldl %cl, %edx, %ebx ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK23-NEXT: movl %ebx, 28(%edx) ; FALLBACK23-NEXT: movl %eax, 24(%edx) ; FALLBACK23-NEXT: shlxl %ecx, %esi, %eax ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK23-NEXT: shldl %cl, %esi, %ebp ; FALLBACK23-NEXT: movl %ebp, 4(%edx) ; FALLBACK23-NEXT: movl %edi, 8(%edx) ; FALLBACK23-NEXT: movl (%esp), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 12(%edx) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 16(%edx) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 20(%edx) ; FALLBACK23-NEXT: movl %eax, (%edx) ; FALLBACK23-NEXT: addl $92, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi ; FALLBACK23-NEXT: popl %ebx ; FALLBACK23-NEXT: popl %ebp ; FALLBACK23-NEXT: retl ; ; FALLBACK24-LABEL: shl_32bytes: ; FALLBACK24: # %bb.0: ; FALLBACK24-NEXT: pushl %ebp ; FALLBACK24-NEXT: pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi ; FALLBACK24-NEXT: subl $108, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK24-NEXT: movzbl (%eax), %ecx ; FALLBACK24-NEXT: movb %cl, %dh ; FALLBACK24-NEXT: shlb $3, %dh ; 
FALLBACK24-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: andb $28, %cl ; FALLBACK24-NEXT: negb %cl ; FALLBACK24-NEXT: movsbl %cl, %ebx ; FALLBACK24-NEXT: movl 84(%esp,%ebx), %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: movb %dh, %dl ; FALLBACK24-NEXT: notb %dl ; FALLBACK24-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK24-NEXT: movl %esi, %eax ; FALLBACK24-NEXT: shrl %eax ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: orl %edi, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: movl %ebx, %edi ; FALLBACK24-NEXT: movl 76(%esp,%ebx), %ebp ; FALLBACK24-NEXT: movl %ebp, %eax ; FALLBACK24-NEXT: shrl %eax ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: orl %esi, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: shll %cl, %ebp ; FALLBACK24-NEXT: movl 72(%esp,%ebx), %ebx ; FALLBACK24-NEXT: movl %ebx, %eax ; FALLBACK24-NEXT: shrl %eax ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 68(%esp,%edi), %ebp ; FALLBACK24-NEXT: movl %ebp, %esi ; FALLBACK24-NEXT: shrl %esi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %esi ; FALLBACK24-NEXT: orl %ebx, %esi ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: shll %cl, %ebp ; FALLBACK24-NEXT: movl 64(%esp,%edi), %ebx ; FALLBACK24-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: shrl %ebx ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: orl %ebp, %ebx ; FALLBACK24-NEXT: movl 88(%esp,%edi), %ebp ; FALLBACK24-NEXT: movl %ebp, %edi ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: shrl %eax ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: orl %edi, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: movl 92(%esp,%eax), %edi ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: shrl %ebp ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: orl %edi, %ebp ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl %edx, (%eax) ; FALLBACK24-NEXT: movl %ebp, 28(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 24(%eax) ; FALLBACK24-NEXT: movl %ebx, 4(%eax) ; FALLBACK24-NEXT: movl %esi, 8(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 12(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 16(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 20(%eax) ; FALLBACK24-NEXT: addl $108, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx ; FALLBACK24-NEXT: popl %ebp ; FALLBACK24-NEXT: vzeroupper ; FALLBACK24-NEXT: retl ; ; FALLBACK25-LABEL: shl_32bytes: ; FALLBACK25: # %bb.0: ; FALLBACK25-NEXT: pushl %ebp ; FALLBACK25-NEXT: pushl 
%ebx ; FALLBACK25-NEXT: pushl %edi ; FALLBACK25-NEXT: pushl %esi ; FALLBACK25-NEXT: subl $92, %esp ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK25-NEXT: movzbl (%eax), %eax ; FALLBACK25-NEXT: movl %eax, %ecx ; FALLBACK25-NEXT: shlb $3, %cl ; FALLBACK25-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: andb $28, %al ; FALLBACK25-NEXT: negb %al ; FALLBACK25-NEXT: movsbl %al, %ebp ; FALLBACK25-NEXT: movl 64(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl 68(%esp,%ebp), %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 60(%esp,%ebp), %edx ; FALLBACK25-NEXT: shldl %cl, %edx, %eax ; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edi ; FALLBACK25-NEXT: shldl %cl, %edi, %edx ; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK25-NEXT: movl 52(%esp,%ebp), %ebx ; FALLBACK25-NEXT: shldl %cl, %ebx, %edi ; FALLBACK25-NEXT: movl 72(%esp,%ebp), %edx ; FALLBACK25-NEXT: movl %edx, %eax ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK25-NEXT: shldl %cl, %esi, %eax ; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi ; FALLBACK25-NEXT: movl 76(%esp,%ebp), %ebp ; FALLBACK25-NEXT: shldl %cl, %edx, %ebp ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK25-NEXT: movl %ebp, 28(%edx) ; FALLBACK25-NEXT: movl %eax, 24(%edx) ; FALLBACK25-NEXT: movl %esi, %eax ; FALLBACK25-NEXT: shll %cl, %eax ; FALLBACK25-NEXT: shldl %cl, %esi, %ebx ; FALLBACK25-NEXT: movl %ebx, 4(%edx) ; FALLBACK25-NEXT: movl %edi, 8(%edx) ; FALLBACK25-NEXT: movl (%esp), %ecx # 4-byte Reload ; FALLBACK25-NEXT: movl %ecx, 12(%edx) ; FALLBACK25-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK25-NEXT: movl %ecx, 16(%edx) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK25-NEXT: movl %ecx, 20(%edx) ; FALLBACK25-NEXT: movl %eax, (%edx) ; FALLBACK25-NEXT: addl $92, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi ; FALLBACK25-NEXT: popl %ebx ; FALLBACK25-NEXT: popl %ebp ; FALLBACK25-NEXT: vzeroupper ; FALLBACK25-NEXT: retl ; ; FALLBACK26-LABEL: shl_32bytes: ; FALLBACK26: # %bb.0: ; FALLBACK26-NEXT: pushl %ebp ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi ; FALLBACK26-NEXT: subl $108, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK26-NEXT: movzbl (%eax), %ecx ; FALLBACK26-NEXT: movl %ecx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: andb $28, %cl ; FALLBACK26-NEXT: negb %cl ; FALLBACK26-NEXT: movsbl %cl, %edx ; FALLBACK26-NEXT: movl 84(%esp,%edx), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx ; FALLBACK26-NEXT: movl 80(%esp,%edx), %esi ; FALLBACK26-NEXT: shlxl %eax, %esi, %edi ; FALLBACK26-NEXT: movl %eax, %ebx ; FALLBACK26-NEXT: notb %bl ; FALLBACK26-NEXT: shrl %esi ; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: orl %ecx, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 76(%esp,%edx), %ecx ; FALLBACK26-NEXT: movl %ecx, %esi ; FALLBACK26-NEXT: shrl %esi ; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: orl %edi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shlxl %eax, %ecx, %ecx ; FALLBACK26-NEXT: movl 72(%esp,%edx), %esi ; FALLBACK26-NEXT: movl %esi, %edi ; 
FALLBACK26-NEXT: shrl %edi ; FALLBACK26-NEXT: shrxl %ebx, %edi, %edi ; FALLBACK26-NEXT: orl %ecx, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shlxl %eax, %esi, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%edx), %esi ; FALLBACK26-NEXT: movl %esi, %edi ; FALLBACK26-NEXT: shrl %edi ; FALLBACK26-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK26-NEXT: orl %ecx, %ebp ; FALLBACK26-NEXT: shlxl %eax, %esi, %edi ; FALLBACK26-NEXT: movl 64(%esp,%edx), %esi ; FALLBACK26-NEXT: movl %esi, %ecx ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: orl %edi, %ecx ; FALLBACK26-NEXT: shlxl %eax, %esi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shlxl %eax, 92(%esp,%edx), %edi ; FALLBACK26-NEXT: movl 88(%esp,%edx), %edx ; FALLBACK26-NEXT: shlxl %eax, %edx, %esi ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: shrl %eax ; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: shrl %edx ; FALLBACK26-NEXT: shrxl %ebx, %edx, %edx ; FALLBACK26-NEXT: orl %edi, %edx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK26-NEXT: movl %edi, (%esi) ; FALLBACK26-NEXT: movl %edx, 28(%esi) ; FALLBACK26-NEXT: movl %eax, 24(%esi) ; FALLBACK26-NEXT: movl %ecx, 4(%esi) ; FALLBACK26-NEXT: movl %ebp, 8(%esi) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 12(%esi) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 16(%esi) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 20(%esi) ; FALLBACK26-NEXT: addl $108, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx ; FALLBACK26-NEXT: popl %ebp ; FALLBACK26-NEXT: vzeroupper ; FALLBACK26-NEXT: retl 
; ; FALLBACK27-LABEL: shl_32bytes: ; FALLBACK27: # %bb.0: ; FALLBACK27-NEXT: pushl %ebp ; FALLBACK27-NEXT: pushl %ebx ; FALLBACK27-NEXT: pushl %edi ; FALLBACK27-NEXT: pushl %esi ; FALLBACK27-NEXT: subl $92, %esp ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK27-NEXT: movzbl (%eax), %eax ; FALLBACK27-NEXT: movl %eax, %ecx ; FALLBACK27-NEXT: shlb $3, %cl ; FALLBACK27-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: andb $28, %al ; FALLBACK27-NEXT: negb %al ; FALLBACK27-NEXT: movsbl %al, %ebx ; FALLBACK27-NEXT: movl 64(%esp,%ebx), %eax ; FALLBACK27-NEXT: movl 68(%esp,%ebx), %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shldl %cl, %eax, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 60(%esp,%ebx), %edx ; FALLBACK27-NEXT: shldl %cl, %edx, %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 56(%esp,%ebx), %edi ; FALLBACK27-NEXT: shldl %cl, %edi, %edx ; FALLBACK27-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK27-NEXT: movl 52(%esp,%ebx), %ebp ; FALLBACK27-NEXT: shldl %cl, %ebp, %edi ; FALLBACK27-NEXT: movl 72(%esp,%ebx), %edx ; FALLBACK27-NEXT: movl %edx, %eax ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK27-NEXT: shldl %cl, %esi, %eax ; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi ; FALLBACK27-NEXT: movl 76(%esp,%ebx), %ebx ; FALLBACK27-NEXT: shldl %cl, %edx, %ebx ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK27-NEXT: movl %ebx, 28(%edx) ; FALLBACK27-NEXT: movl %eax, 24(%edx) ; FALLBACK27-NEXT: shlxl %ecx, %esi, %eax ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK27-NEXT: shldl %cl, %esi, %ebp ; FALLBACK27-NEXT: movl %ebp, 4(%edx) ; FALLBACK27-NEXT: movl %edi, 
8(%edx) ; FALLBACK27-NEXT: movl (%esp), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 12(%edx) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 16(%edx) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 20(%edx) ; FALLBACK27-NEXT: movl %eax, (%edx) ; FALLBACK27-NEXT: addl $92, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi ; FALLBACK27-NEXT: popl %ebx ; FALLBACK27-NEXT: popl %ebp ; FALLBACK27-NEXT: vzeroupper ; FALLBACK27-NEXT: retl ; ; FALLBACK28-LABEL: shl_32bytes: ; FALLBACK28: # %bb.0: ; FALLBACK28-NEXT: pushl %ebp ; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi ; FALLBACK28-NEXT: subl $108, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK28-NEXT: movzbl (%eax), %ecx ; FALLBACK28-NEXT: movb %cl, %dh ; FALLBACK28-NEXT: shlb $3, %dh ; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: andb $28, %cl ; FALLBACK28-NEXT: negb %cl ; FALLBACK28-NEXT: movsbl %cl, %ebx ; FALLBACK28-NEXT: movl 84(%esp,%ebx), %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: shll %cl, %edi ; FALLBACK28-NEXT: movb %dh, %dl ; FALLBACK28-NEXT: notb %dl ; FALLBACK28-NEXT: movl 80(%esp,%ebx), %esi ; FALLBACK28-NEXT: movl %esi, %eax ; FALLBACK28-NEXT: shrl %eax ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: orl %edi, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: movl %ebx, %edi ; FALLBACK28-NEXT: movl 76(%esp,%ebx), %ebp ; FALLBACK28-NEXT: movl %ebp, %eax ; FALLBACK28-NEXT: shrl %eax ; 
FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: orl %esi, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: shll %cl, %ebp ; FALLBACK28-NEXT: movl 72(%esp,%ebx), %ebx ; FALLBACK28-NEXT: movl %ebx, %eax ; FALLBACK28-NEXT: shrl %eax ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 68(%esp,%edi), %ebp ; FALLBACK28-NEXT: movl %ebp, %esi ; FALLBACK28-NEXT: shrl %esi ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %esi ; FALLBACK28-NEXT: orl %ebx, %esi ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: shll %cl, %ebp ; FALLBACK28-NEXT: movl 64(%esp,%edi), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: shrl %ebx ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: orl %ebp, %ebx ; FALLBACK28-NEXT: movl 88(%esp,%edi), %ebp ; FALLBACK28-NEXT: movl %ebp, %edi ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: shll %cl, %edi ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: shrl %eax ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: orl %edi, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: movl 92(%esp,%eax), %edi ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: shll %cl, %edi ; FALLBACK28-NEXT: shrl %ebp ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: orl %edi, %ebp ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%edx # 4-byte Reload ; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl %edx, (%eax) ; FALLBACK28-NEXT: movl %ebp, 28(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 24(%eax) ; FALLBACK28-NEXT: movl %ebx, 4(%eax) ; FALLBACK28-NEXT: movl %esi, 8(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 12(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 16(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 20(%eax) ; FALLBACK28-NEXT: addl $108, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx ; FALLBACK28-NEXT: popl %ebp ; FALLBACK28-NEXT: vzeroupper ; FALLBACK28-NEXT: retl ; ; FALLBACK29-LABEL: shl_32bytes: ; FALLBACK29: # %bb.0: ; FALLBACK29-NEXT: pushl %ebp ; FALLBACK29-NEXT: pushl %ebx ; FALLBACK29-NEXT: pushl %edi ; FALLBACK29-NEXT: pushl %esi ; FALLBACK29-NEXT: subl $92, %esp ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK29-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK29-NEXT: movzbl (%eax), %eax ; FALLBACK29-NEXT: movl %eax, %ecx ; FALLBACK29-NEXT: shlb $3, %cl ; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK29-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: andb $28, %al ; FALLBACK29-NEXT: negb %al ; FALLBACK29-NEXT: movsbl %al, %ebp ; FALLBACK29-NEXT: movl 64(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl 68(%esp,%ebp), %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 60(%esp,%ebp), %edx ; FALLBACK29-NEXT: shldl %cl, %edx, %eax ; FALLBACK29-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edi ; FALLBACK29-NEXT: shldl %cl, %edi, %edx ; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK29-NEXT: movl 52(%esp,%ebp), %ebx ; FALLBACK29-NEXT: shldl %cl, %ebx, %edi ; FALLBACK29-NEXT: movl 72(%esp,%ebp), %edx ; FALLBACK29-NEXT: movl %edx, %eax ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK29-NEXT: shldl %cl, %esi, %eax ; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi ; FALLBACK29-NEXT: movl 76(%esp,%ebp), %ebp ; FALLBACK29-NEXT: shldl %cl, %edx, %ebp ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK29-NEXT: movl %ebp, 28(%edx) ; FALLBACK29-NEXT: movl %eax, 24(%edx) ; FALLBACK29-NEXT: movl %esi, %eax ; FALLBACK29-NEXT: shll %cl, %eax ; FALLBACK29-NEXT: shldl %cl, %esi, %ebx ; FALLBACK29-NEXT: movl %ebx, 4(%edx) ; FALLBACK29-NEXT: movl %edi, 8(%edx) ; FALLBACK29-NEXT: movl (%esp), %ecx # 4-byte Reload ; FALLBACK29-NEXT: movl %ecx, 12(%edx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK29-NEXT: movl %ecx, 16(%edx) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK29-NEXT: movl %ecx, 20(%edx) ; FALLBACK29-NEXT: movl %eax, (%edx) ; FALLBACK29-NEXT: addl $92, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi ; FALLBACK29-NEXT: popl %ebx ; FALLBACK29-NEXT: popl %ebp ; FALLBACK29-NEXT: vzeroupper ; FALLBACK29-NEXT: retl ; ; FALLBACK30-LABEL: shl_32bytes: ; FALLBACK30: # %bb.0: ; FALLBACK30-NEXT: pushl %ebp ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi ; FALLBACK30-NEXT: subl $108, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK30-NEXT: movzbl (%eax), %ecx ; FALLBACK30-NEXT: movl %ecx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; 
FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: andb $28, %cl ; FALLBACK30-NEXT: negb %cl ; FALLBACK30-NEXT: movsbl %cl, %edx ; FALLBACK30-NEXT: movl 84(%esp,%edx), %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx ; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi ; FALLBACK30-NEXT: shlxl %eax, %esi, %edi ; FALLBACK30-NEXT: movl %eax, %ebx ; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: orl %ecx, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 76(%esp,%edx), %ecx ; FALLBACK30-NEXT: movl %ecx, %esi ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: orl %edi, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shlxl %eax, %ecx, %ecx ; FALLBACK30-NEXT: movl 72(%esp,%edx), %esi ; FALLBACK30-NEXT: movl %esi, %edi ; FALLBACK30-NEXT: shrl %edi ; FALLBACK30-NEXT: shrxl %ebx, %edi, %edi ; FALLBACK30-NEXT: orl %ecx, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shlxl %eax, %esi, %ecx ; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi ; FALLBACK30-NEXT: movl %esi, %edi ; FALLBACK30-NEXT: shrl %edi ; FALLBACK30-NEXT: shrxl %ebx, %edi, %ebp ; FALLBACK30-NEXT: orl %ecx, %ebp ; FALLBACK30-NEXT: shlxl %eax, %esi, %edi ; FALLBACK30-NEXT: movl 64(%esp,%edx), %esi ; FALLBACK30-NEXT: movl %esi, %ecx ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK30-NEXT: orl %edi, %ecx ; FALLBACK30-NEXT: shlxl %eax, %esi, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shlxl %eax, 92(%esp,%edx), %edi ; FALLBACK30-NEXT: movl 88(%esp,%edx), %edx ; FALLBACK30-NEXT: shlxl %eax, %edx, %esi ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: shrl %eax ; FALLBACK30-NEXT: 
shrxl %ebx, %eax, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: shrl %edx ; FALLBACK30-NEXT: shrxl %ebx, %edx, %edx ; FALLBACK30-NEXT: orl %edi, %edx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK30-NEXT: movl %edi, (%esi) ; FALLBACK30-NEXT: movl %edx, 28(%esi) ; FALLBACK30-NEXT: movl %eax, 24(%esi) ; FALLBACK30-NEXT: movl %ecx, 4(%esi) ; FALLBACK30-NEXT: movl %ebp, 8(%esi) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 12(%esi) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 16(%esi) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 20(%esi) ; FALLBACK30-NEXT: addl $108, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx ; FALLBACK30-NEXT: popl %ebp ; FALLBACK30-NEXT: vzeroupper ; FALLBACK30-NEXT: retl ; ; FALLBACK31-LABEL: shl_32bytes: ; FALLBACK31: # %bb.0: ; FALLBACK31-NEXT: pushl %ebp ; FALLBACK31-NEXT: pushl %ebx ; FALLBACK31-NEXT: pushl %edi ; FALLBACK31-NEXT: pushl %esi ; FALLBACK31-NEXT: subl $92, %esp ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK31-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK31-NEXT: movzbl (%eax), %eax ; FALLBACK31-NEXT: movl %eax, %ecx ; FALLBACK31-NEXT: shlb $3, %cl ; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK31-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: andb $28, %al ; FALLBACK31-NEXT: negb %al ; FALLBACK31-NEXT: movsbl %al, %ebx ; FALLBACK31-NEXT: movl 64(%esp,%ebx), %eax ; FALLBACK31-NEXT: movl 68(%esp,%ebx), %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shldl %cl, %eax, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 
60(%esp,%ebx), %edx ; FALLBACK31-NEXT: shldl %cl, %edx, %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 56(%esp,%ebx), %edi ; FALLBACK31-NEXT: shldl %cl, %edi, %edx ; FALLBACK31-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK31-NEXT: movl 52(%esp,%ebx), %ebp ; FALLBACK31-NEXT: shldl %cl, %ebp, %edi ; FALLBACK31-NEXT: movl 72(%esp,%ebx), %edx ; FALLBACK31-NEXT: movl %edx, %eax ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK31-NEXT: shldl %cl, %esi, %eax ; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi ; FALLBACK31-NEXT: movl 76(%esp,%ebx), %ebx ; FALLBACK31-NEXT: shldl %cl, %edx, %ebx ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK31-NEXT: movl %ebx, 28(%edx) ; FALLBACK31-NEXT: movl %eax, 24(%edx) ; FALLBACK31-NEXT: shlxl %ecx, %esi, %eax ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK31-NEXT: shldl %cl, %esi, %ebp ; FALLBACK31-NEXT: movl %ebp, 4(%edx) ; FALLBACK31-NEXT: movl %edi, 8(%edx) ; FALLBACK31-NEXT: movl (%esp), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 12(%edx) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 16(%edx) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 20(%edx) ; FALLBACK31-NEXT: movl %eax, (%edx) ; FALLBACK31-NEXT: addl $92, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi ; FALLBACK31-NEXT: popl %ebx ; FALLBACK31-NEXT: popl %ebp ; FALLBACK31-NEXT: vzeroupper ; FALLBACK31-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr %byteOff.ptr, align 1 %bitOff = shl i256 %byteOff, 3 %res = shl i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } define void @shl_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: shl_32bytes_dwordOff: ; FALLBACK0: # %bb.0: ; FALLBACK0-NEXT: pushq %rbx ; FALLBACK0-NEXT: movq (%rdi), %rcx ; FALLBACK0-NEXT: 
movq 8(%rdi), %r8 ; FALLBACK0-NEXT: movq 16(%rdi), %r9 ; FALLBACK0-NEXT: movq 24(%rdi), %rdi ; FALLBACK0-NEXT: movzbl (%rsi), %esi ; FALLBACK0-NEXT: movl %esi, %eax ; FALLBACK0-NEXT: shlb $5, %al ; FALLBACK0-NEXT: xorps %xmm0, %xmm0 ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: shlb $2, %sil ; FALLBACK0-NEXT: andb $24, %sil ; FALLBACK0-NEXT: negb %sil ; FALLBACK0-NEXT: movsbq %sil, %r10 ; FALLBACK0-NEXT: movq -32(%rsp,%r10), %r8 ; FALLBACK0-NEXT: movq -24(%rsp,%r10), %rdi ; FALLBACK0-NEXT: movq %rdi, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq %r8, %r9 ; FALLBACK0-NEXT: shrq %r9 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %r9 ; FALLBACK0-NEXT: orq %r11, %r9 ; FALLBACK0-NEXT: movq -8(%rsp,%r10), %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r11 ; FALLBACK0-NEXT: movq -16(%rsp,%r10), %r10 ; FALLBACK0-NEXT: movq %r10, %rbx ; FALLBACK0-NEXT: shrq %rbx ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx ; FALLBACK0-NEXT: orq %r11, %rbx ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: shrq %rdi ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %rdi ; FALLBACK0-NEXT: orq %r10, %rdi ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 ; FALLBACK0-NEXT: movq %r8, (%rdx) ; FALLBACK0-NEXT: movq %rdi, 16(%rdx) ; FALLBACK0-NEXT: movq %rbx, 24(%rdx) ; FALLBACK0-NEXT: movq %r9, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: shl_32bytes_dwordOff: ; FALLBACK1: # %bb.0: ; FALLBACK1-NEXT: movq (%rdi), %rax ; FALLBACK1-NEXT: movq 8(%rdi), %r8 ; FALLBACK1-NEXT: movq 16(%rdi), 
%r9 ; FALLBACK1-NEXT: movq 24(%rdi), %rdi ; FALLBACK1-NEXT: movzbl (%rsi), %esi ; FALLBACK1-NEXT: movl %esi, %ecx ; FALLBACK1-NEXT: shlb $5, %cl ; FALLBACK1-NEXT: xorps %xmm0, %xmm0 ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: shlb $2, %sil ; FALLBACK1-NEXT: andb $24, %sil ; FALLBACK1-NEXT: negb %sil ; FALLBACK1-NEXT: movsbq %sil, %rax ; FALLBACK1-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK1-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK1-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK1-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK1-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK1-NEXT: shldq %cl, %rax, %rsi ; FALLBACK1-NEXT: shldq %cl, %r8, %rax ; FALLBACK1-NEXT: shlq %cl, %r8 ; FALLBACK1-NEXT: movq %rsi, 16(%rdx) ; FALLBACK1-NEXT: movq %rdi, 24(%rdx) ; FALLBACK1-NEXT: movq %r8, (%rdx) ; FALLBACK1-NEXT: movq %rax, 8(%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: shl_32bytes_dwordOff: ; FALLBACK2: # %bb.0: ; FALLBACK2-NEXT: movq (%rdi), %rcx ; FALLBACK2-NEXT: movq 8(%rdi), %r8 ; FALLBACK2-NEXT: movq 16(%rdi), %r9 ; FALLBACK2-NEXT: movq 24(%rdi), %rdi ; FALLBACK2-NEXT: movzbl (%rsi), %esi ; FALLBACK2-NEXT: movl %esi, %eax ; FALLBACK2-NEXT: shlb $5, %al ; FALLBACK2-NEXT: xorps %xmm0, %xmm0 ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: shlb $2, %sil ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: negb %sil ; FALLBACK2-NEXT: movsbq %sil, %rsi ; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi ; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %rcx ; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 ; 
FALLBACK2-NEXT: shlxq %rax, -16(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %rsi ; FALLBACK2-NEXT: shlxq %rax, %rsi, %r10 ; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK2-NEXT: orq %r8, %rdi ; FALLBACK2-NEXT: shrq %rsi ; FALLBACK2-NEXT: shrxq %rax, %rsi, %rsi ; FALLBACK2-NEXT: orq %r9, %rsi ; FALLBACK2-NEXT: shrq %rcx ; FALLBACK2-NEXT: shrxq %rax, %rcx, %rax ; FALLBACK2-NEXT: orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, (%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) ; FALLBACK2-NEXT: movq %rsi, 24(%rdx) ; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_32bytes_dwordOff: ; FALLBACK3: # %bb.0: ; FALLBACK3-NEXT: movq (%rdi), %rax ; FALLBACK3-NEXT: movq 8(%rdi), %r8 ; FALLBACK3-NEXT: movq 16(%rdi), %r9 ; FALLBACK3-NEXT: movq 24(%rdi), %rdi ; FALLBACK3-NEXT: movzbl (%rsi), %esi ; FALLBACK3-NEXT: movl %esi, %ecx ; FALLBACK3-NEXT: shlb $5, %cl ; FALLBACK3-NEXT: xorps %xmm0, %xmm0 ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: shlb $2, %sil ; FALLBACK3-NEXT: andb $24, %sil ; FALLBACK3-NEXT: negb %sil ; FALLBACK3-NEXT: movsbq %sil, %rax ; FALLBACK3-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK3-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK3-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK3-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK3-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK3-NEXT: shldq %cl, %rax, %rsi ; FALLBACK3-NEXT: shldq %cl, %r8, %rax ; FALLBACK3-NEXT: shlxq %rcx, %r8, %rcx ; FALLBACK3-NEXT: movq %rsi, 16(%rdx) ; FALLBACK3-NEXT: movq %rdi, 24(%rdx) ; FALLBACK3-NEXT: movq %rcx, (%rdx) ; FALLBACK3-NEXT: movq %rax, 8(%rdx) ; 
FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: shl_32bytes_dwordOff: ; FALLBACK4: # %bb.0: ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK4-NEXT: movzbl (%rsi), %ecx ; FALLBACK4-NEXT: movl %ecx, %eax ; FALLBACK4-NEXT: shlb $5, %al ; FALLBACK4-NEXT: xorps %xmm2, %xmm2 ; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: shlb $2, %cl ; FALLBACK4-NEXT: andb $24, %cl ; FALLBACK4-NEXT: negb %cl ; FALLBACK4-NEXT: movsbq %cl, %r8 ; FALLBACK4-NEXT: movq -16(%rsp,%r8), %r9 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r9 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: movq -24(%rsp,%r8), %r10 ; FALLBACK4-NEXT: movq %r10, %rdi ; FALLBACK4-NEXT: shrq %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %rdi ; FALLBACK4-NEXT: orq %r9, %rdi ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: movq -40(%rsp,%r8), %r9 ; FALLBACK4-NEXT: movq -32(%rsp,%r8), %r8 ; FALLBACK4-NEXT: movq %r8, %r11 ; FALLBACK4-NEXT: shrq %r11 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r11 ; FALLBACK4-NEXT: orq %r10, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r8 ; FALLBACK4-NEXT: movq %r9, %r10 ; FALLBACK4-NEXT: shrq %r10 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 ; FALLBACK4-NEXT: orq %r8, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r9 ; FALLBACK4-NEXT: movq %r9, (%rdx) ; FALLBACK4-NEXT: movq %r10, 8(%rdx) ; FALLBACK4-NEXT: movq %r11, 16(%rdx) ; FALLBACK4-NEXT: movq %rdi, 24(%rdx) ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: shl_32bytes_dwordOff: ; FALLBACK5: # %bb.0: ; FALLBACK5-NEXT: movups (%rdi), %xmm0 ; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK5-NEXT: movzbl (%rsi), %eax ; FALLBACK5-NEXT: movl %eax, %ecx ; 
FALLBACK5-NEXT: shlb $5, %cl ; FALLBACK5-NEXT: xorps %xmm2, %xmm2 ; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: shlb $2, %al ; FALLBACK5-NEXT: andb $24, %al ; FALLBACK5-NEXT: negb %al ; FALLBACK5-NEXT: movsbq %al, %rax ; FALLBACK5-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK5-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK5-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK5-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK5-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK5-NEXT: shldq %cl, %rax, %rsi ; FALLBACK5-NEXT: movq %r8, %r9 ; FALLBACK5-NEXT: shlq %cl, %r9 ; FALLBACK5-NEXT: shldq %cl, %r8, %rax ; FALLBACK5-NEXT: movq %rax, 8(%rdx) ; FALLBACK5-NEXT: movq %rsi, 16(%rdx) ; FALLBACK5-NEXT: movq %rdi, 24(%rdx) ; FALLBACK5-NEXT: movq %r9, (%rdx) ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: shl_32bytes_dwordOff: ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movzbl (%rsi), %ecx ; FALLBACK6-NEXT: movl %ecx, %eax ; FALLBACK6-NEXT: shlb $5, %al ; FALLBACK6-NEXT: xorps %xmm2, %xmm2 ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: shlb $2, %cl ; FALLBACK6-NEXT: andb $24, %cl ; FALLBACK6-NEXT: negb %cl ; FALLBACK6-NEXT: movsbq %cl, %rcx ; FALLBACK6-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi ; FALLBACK6-NEXT: movq -24(%rsp,%rcx), %rdi ; FALLBACK6-NEXT: shlxq %rax, %rdi, %r8 ; FALLBACK6-NEXT: movq -40(%rsp,%rcx), %r9 ; FALLBACK6-NEXT: movq -32(%rsp,%rcx), %rcx ; FALLBACK6-NEXT: shlxq %rax, %rcx, %r10 ; FALLBACK6-NEXT: shlxq %rax, %r9, %r11 ; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK6-NEXT: notb %al ; FALLBACK6-NEXT: shrq %rdi ; FALLBACK6-NEXT: shrxq %rax, %rdi, %rdi ; 
FALLBACK6-NEXT: orq %rsi, %rdi ; FALLBACK6-NEXT: shrq %rcx ; FALLBACK6-NEXT: shrxq %rax, %rcx, %rcx ; FALLBACK6-NEXT: orq %r8, %rcx ; FALLBACK6-NEXT: shrq %r9 ; FALLBACK6-NEXT: shrxq %rax, %r9, %rax ; FALLBACK6-NEXT: orq %r10, %rax ; FALLBACK6-NEXT: movq %r11, (%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) ; FALLBACK6-NEXT: movq %rcx, 16(%rdx) ; FALLBACK6-NEXT: movq %rdi, 24(%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_32bytes_dwordOff: ; FALLBACK7: # %bb.0: ; FALLBACK7-NEXT: movups (%rdi), %xmm0 ; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK7-NEXT: movzbl (%rsi), %eax ; FALLBACK7-NEXT: movl %eax, %ecx ; FALLBACK7-NEXT: shlb $5, %cl ; FALLBACK7-NEXT: xorps %xmm2, %xmm2 ; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: shlb $2, %al ; FALLBACK7-NEXT: andb $24, %al ; FALLBACK7-NEXT: negb %al ; FALLBACK7-NEXT: movsbq %al, %rax ; FALLBACK7-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK7-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK7-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK7-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK7-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK7-NEXT: shldq %cl, %rax, %rsi ; FALLBACK7-NEXT: shlxq %rcx, %r8, %r9 ; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK7-NEXT: shldq %cl, %r8, %rax ; FALLBACK7-NEXT: movq %rax, 8(%rdx) ; FALLBACK7-NEXT: movq %rsi, 16(%rdx) ; FALLBACK7-NEXT: movq %rdi, 24(%rdx) ; FALLBACK7-NEXT: movq %r9, (%rdx) ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: shl_32bytes_dwordOff: ; FALLBACK8: # %bb.0: ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: movzbl (%rsi), %ecx ; FALLBACK8-NEXT: movl %ecx, %eax ; FALLBACK8-NEXT: shlb $5, %al ; FALLBACK8-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: shlb $2, %cl ; FALLBACK8-NEXT: andb $24, %cl ; 
FALLBACK8-NEXT: negb %cl ; FALLBACK8-NEXT: movsbq %cl, %r8 ; FALLBACK8-NEXT: movq -16(%rsp,%r8), %r9 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r9 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: movq -24(%rsp,%r8), %r10 ; FALLBACK8-NEXT: movq %r10, %rdi ; FALLBACK8-NEXT: shrq %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %rdi ; FALLBACK8-NEXT: orq %r9, %rdi ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: movq -40(%rsp,%r8), %r9 ; FALLBACK8-NEXT: movq -32(%rsp,%r8), %r8 ; FALLBACK8-NEXT: movq %r8, %r11 ; FALLBACK8-NEXT: shrq %r11 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r11 ; FALLBACK8-NEXT: orq %r10, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r8 ; FALLBACK8-NEXT: movq %r9, %r10 ; FALLBACK8-NEXT: shrq %r10 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 ; FALLBACK8-NEXT: orq %r8, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r9 ; FALLBACK8-NEXT: movq %r9, (%rdx) ; FALLBACK8-NEXT: movq %r10, 8(%rdx) ; FALLBACK8-NEXT: movq %r11, 16(%rdx) ; FALLBACK8-NEXT: movq %rdi, 24(%rdx) ; FALLBACK8-NEXT: vzeroupper ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: shl_32bytes_dwordOff: ; FALLBACK9: # %bb.0: ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: movzbl (%rsi), %eax ; FALLBACK9-NEXT: movl %eax, %ecx ; FALLBACK9-NEXT: shlb $5, %cl ; FALLBACK9-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: shlb $2, %al ; FALLBACK9-NEXT: andb $24, %al ; FALLBACK9-NEXT: negb %al ; FALLBACK9-NEXT: movsbq %al, %rax ; FALLBACK9-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK9-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK9-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK9-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK9-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK9-NEXT: shldq %cl, %rax, %rsi ; FALLBACK9-NEXT: movq %r8, 
%r9 ; FALLBACK9-NEXT: shlq %cl, %r9 ; FALLBACK9-NEXT: shldq %cl, %r8, %rax ; FALLBACK9-NEXT: movq %rax, 8(%rdx) ; FALLBACK9-NEXT: movq %rsi, 16(%rdx) ; FALLBACK9-NEXT: movq %rdi, 24(%rdx) ; FALLBACK9-NEXT: movq %r9, (%rdx) ; FALLBACK9-NEXT: vzeroupper ; FALLBACK9-NEXT: retq ; ; FALLBACK10-LABEL: shl_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: movzbl (%rsi), %ecx ; FALLBACK10-NEXT: movl %ecx, %eax ; FALLBACK10-NEXT: shlb $5, %al ; FALLBACK10-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: shlb $2, %cl ; FALLBACK10-NEXT: andb $24, %cl ; FALLBACK10-NEXT: negb %cl ; FALLBACK10-NEXT: movsbq %cl, %rcx ; FALLBACK10-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi ; FALLBACK10-NEXT: movq -24(%rsp,%rcx), %rdi ; FALLBACK10-NEXT: shlxq %rax, %rdi, %r8 ; FALLBACK10-NEXT: movq -40(%rsp,%rcx), %r9 ; FALLBACK10-NEXT: movq -32(%rsp,%rcx), %rcx ; FALLBACK10-NEXT: shlxq %rax, %rcx, %r10 ; FALLBACK10-NEXT: shlxq %rax, %r9, %r11 ; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK10-NEXT: notb %al ; FALLBACK10-NEXT: shrq %rdi ; FALLBACK10-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK10-NEXT: orq %rsi, %rdi ; FALLBACK10-NEXT: shrq %rcx ; FALLBACK10-NEXT: shrxq %rax, %rcx, %rcx ; FALLBACK10-NEXT: orq %r8, %rcx ; FALLBACK10-NEXT: shrq %r9 ; FALLBACK10-NEXT: shrxq %rax, %r9, %rax ; FALLBACK10-NEXT: orq %r10, %rax ; FALLBACK10-NEXT: movq %r11, (%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) ; FALLBACK10-NEXT: movq %rcx, 16(%rdx) ; FALLBACK10-NEXT: movq %rdi, 24(%rdx) ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: shl_32bytes_dwordOff: ; FALLBACK11: # %bb.0: ; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK11-NEXT: movzbl (%rsi), %eax ; FALLBACK11-NEXT: movl %eax, %ecx ; FALLBACK11-NEXT: shlb $5, %cl ; FALLBACK11-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK11-NEXT: vmovups %ymm1, 
-{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: shlb $2, %al ; FALLBACK11-NEXT: andb $24, %al ; FALLBACK11-NEXT: negb %al ; FALLBACK11-NEXT: movsbq %al, %rax ; FALLBACK11-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK11-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK11-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK11-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK11-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK11-NEXT: shldq %cl, %rax, %rsi ; FALLBACK11-NEXT: shlxq %rcx, %r8, %r9 ; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK11-NEXT: shldq %cl, %r8, %rax ; FALLBACK11-NEXT: movq %rax, 8(%rdx) ; FALLBACK11-NEXT: movq %rsi, 16(%rdx) ; FALLBACK11-NEXT: movq %rdi, 24(%rdx) ; FALLBACK11-NEXT: movq %r9, (%rdx) ; FALLBACK11-NEXT: vzeroupper ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: shl_32bytes_dwordOff: ; FALLBACK12: # %bb.0: ; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK12-NEXT: movzbl (%rsi), %ecx ; FALLBACK12-NEXT: movl %ecx, %eax ; FALLBACK12-NEXT: shlb $5, %al ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: shlb $2, %cl ; FALLBACK12-NEXT: andb $24, %cl ; FALLBACK12-NEXT: negb %cl ; FALLBACK12-NEXT: movsbq %cl, %r8 ; FALLBACK12-NEXT: movq -16(%rsp,%r8), %r9 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r9 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: movq -24(%rsp,%r8), %r10 ; FALLBACK12-NEXT: movq %r10, %rdi ; FALLBACK12-NEXT: shrq %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %rdi ; FALLBACK12-NEXT: orq %r9, %rdi ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: movq -40(%rsp,%r8), %r9 ; FALLBACK12-NEXT: movq -32(%rsp,%r8), %r8 ; FALLBACK12-NEXT: movq %r8, %r11 ; FALLBACK12-NEXT: shrq %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r11 ; FALLBACK12-NEXT: orq %r10, 
%r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r8 ; FALLBACK12-NEXT: movq %r9, %r10 ; FALLBACK12-NEXT: shrq %r10 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 ; FALLBACK12-NEXT: orq %r8, %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r9 ; FALLBACK12-NEXT: movq %r9, (%rdx) ; FALLBACK12-NEXT: movq %r10, 8(%rdx) ; FALLBACK12-NEXT: movq %r11, 16(%rdx) ; FALLBACK12-NEXT: movq %rdi, 24(%rdx) ; FALLBACK12-NEXT: vzeroupper ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: shl_32bytes_dwordOff: ; FALLBACK13: # %bb.0: ; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK13-NEXT: movzbl (%rsi), %eax ; FALLBACK13-NEXT: movl %eax, %ecx ; FALLBACK13-NEXT: shlb $5, %cl ; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK13-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: shlb $2, %al ; FALLBACK13-NEXT: andb $24, %al ; FALLBACK13-NEXT: negb %al ; FALLBACK13-NEXT: movsbq %al, %rax ; FALLBACK13-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK13-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK13-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK13-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK13-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK13-NEXT: shldq %cl, %rax, %rsi ; FALLBACK13-NEXT: movq %r8, %r9 ; FALLBACK13-NEXT: shlq %cl, %r9 ; FALLBACK13-NEXT: shldq %cl, %r8, %rax ; FALLBACK13-NEXT: movq %rax, 8(%rdx) ; FALLBACK13-NEXT: movq %rsi, 16(%rdx) ; FALLBACK13-NEXT: movq %rdi, 24(%rdx) ; FALLBACK13-NEXT: movq %r9, (%rdx) ; FALLBACK13-NEXT: vzeroupper ; FALLBACK13-NEXT: retq ; ; FALLBACK14-LABEL: shl_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK14-NEXT: movzbl (%rsi), %ecx ; FALLBACK14-NEXT: movl %ecx, %eax ; FALLBACK14-NEXT: shlb $5, %al ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: shlb $2, %cl ; FALLBACK14-NEXT: andb 
$24, %cl ; FALLBACK14-NEXT: negb %cl ; FALLBACK14-NEXT: movsbq %cl, %rcx ; FALLBACK14-NEXT: shlxq %rax, -16(%rsp,%rcx), %rsi ; FALLBACK14-NEXT: movq -24(%rsp,%rcx), %rdi ; FALLBACK14-NEXT: shlxq %rax, %rdi, %r8 ; FALLBACK14-NEXT: movq -40(%rsp,%rcx), %r9 ; FALLBACK14-NEXT: movq -32(%rsp,%rcx), %rcx ; FALLBACK14-NEXT: shlxq %rax, %rcx, %r10 ; FALLBACK14-NEXT: shlxq %rax, %r9, %r11 ; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK14-NEXT: notb %al ; FALLBACK14-NEXT: shrq %rdi ; FALLBACK14-NEXT: shrxq %rax, %rdi, %rdi ; FALLBACK14-NEXT: orq %rsi, %rdi ; FALLBACK14-NEXT: shrq %rcx ; FALLBACK14-NEXT: shrxq %rax, %rcx, %rcx ; FALLBACK14-NEXT: orq %r8, %rcx ; FALLBACK14-NEXT: shrq %r9 ; FALLBACK14-NEXT: shrxq %rax, %r9, %rax ; FALLBACK14-NEXT: orq %r10, %rax ; FALLBACK14-NEXT: movq %r11, (%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) ; FALLBACK14-NEXT: movq %rcx, 16(%rdx) ; FALLBACK14-NEXT: movq %rdi, 24(%rdx) ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: shl_32bytes_dwordOff: ; FALLBACK15: # %bb.0: ; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK15-NEXT: movzbl (%rsi), %eax ; FALLBACK15-NEXT: movl %eax, %ecx ; FALLBACK15-NEXT: shlb $5, %cl ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: shlb $2, %al ; FALLBACK15-NEXT: andb $24, %al ; FALLBACK15-NEXT: negb %al ; FALLBACK15-NEXT: movsbq %al, %rax ; FALLBACK15-NEXT: movq -24(%rsp,%rax), %rsi ; FALLBACK15-NEXT: movq -16(%rsp,%rax), %rdi ; FALLBACK15-NEXT: shldq %cl, %rsi, %rdi ; FALLBACK15-NEXT: movq -40(%rsp,%rax), %r8 ; FALLBACK15-NEXT: movq -32(%rsp,%rax), %rax ; FALLBACK15-NEXT: shldq %cl, %rax, %rsi ; FALLBACK15-NEXT: shlxq %rcx, %r8, %r9 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK15-NEXT: shldq %cl, %r8, %rax ; FALLBACK15-NEXT: movq %rax, 8(%rdx) ; FALLBACK15-NEXT: movq %rsi, 16(%rdx) ; FALLBACK15-NEXT: movq %rdi, 
24(%rdx) ; FALLBACK15-NEXT: movq %r9, (%rdx) ; FALLBACK15-NEXT: vzeroupper ; FALLBACK15-NEXT: retq ; ; X86-SSE2-LABEL: shl_32bytes_dwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $92, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp ; X86-SSE2-NEXT: movl (%ebp), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 4(%ebp), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 8(%ebp), %esi ; X86-SSE2-NEXT: movl 12(%ebp), %edi ; X86-SSE2-NEXT: movl 16(%ebp), %ebx ; X86-SSE2-NEXT: movzbl (%ecx), %ecx ; X86-SSE2-NEXT: movl 20(%ebp), %edx ; X86-SSE2-NEXT: movl 24(%ebp), %eax ; X86-SSE2-NEXT: movl 28(%ebp), %ebp ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: xorps %xmm0, %xmm0 ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: shlb $2, %cl ; X86-SSE2-NEXT: andb $28, %cl ; X86-SSE2-NEXT: negb %cl ; X86-SSE2-NEXT: movsbl %cl, %edx ; X86-SSE2-NEXT: movl 48(%esp,%edx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 52(%esp,%edx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 60(%esp,%edx), %esi ; X86-SSE2-NEXT: movl 56(%esp,%edx), %edi ; X86-SSE2-NEXT: movl 68(%esp,%edx), %ebx ; X86-SSE2-NEXT: movl 64(%esp,%edx), %ebp 
; X86-SSE2-NEXT: movl 76(%esp,%edx), %ecx ; X86-SSE2-NEXT: movl 72(%esp,%edx), %edx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %edx, 24(%eax) ; X86-SSE2-NEXT: movl %ecx, 28(%eax) ; X86-SSE2-NEXT: movl %ebp, 16(%eax) ; X86-SSE2-NEXT: movl %ebx, 20(%eax) ; X86-SSE2-NEXT: movl %edi, 8(%eax) ; X86-SSE2-NEXT: movl %esi, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: addl $92, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: shl_32bytes_dwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: subl $76, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1 ; X86-SSE42-NEXT: movzbl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm2, %xmm2 ; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm2, (%esp) ; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: shlb $2, %cl ; X86-SSE42-NEXT: andb $28, %cl ; X86-SSE42-NEXT: negb %cl ; X86-SSE42-NEXT: movsbl %cl, %ecx ; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm0 ; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm1 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $76, %esp ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: shl_32bytes_dwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: subl $76, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovups (%edx), %ymm0 ; X86-AVX-NEXT: movzbl (%ecx), %ecx ; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; 
X86-AVX-NEXT: vmovups %ymm1, (%esp) ; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: shlb $2, %cl ; X86-AVX-NEXT: andb $28, %cl ; X86-AVX-NEXT: negb %cl ; X86-AVX-NEXT: movsbl %cl, %ecx ; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0 ; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1 ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: addl $76, %esp ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 %bitOff = shl i256 %dwordOff, 5 %res = shl i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void }

; Left-shift a 256-bit value by a whole number of qwords: the shift amount
; loaded from %qwordOff.ptr is scaled by 64 (shl by 6) before being used as
; the bit offset. In every configuration checked below the shift is lowered
; to a zero-padded stack copy of the source that is re-read at a negated byte
; offset (shlb $3 / andb $24 / negb, then sign-extended into the index), with
; no shld or shlx, so the expectations are grouped by vector-feature prefix
; and not by the per-RUN FALLBACKn prefixes.
; The assertions in this function are produced by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define void @shl_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind {
; X64-SSE2-LABEL: shl_32bytes_qwordOff:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movq (%rdi), %rax
; X64-SSE2-NEXT: movq 8(%rdi), %rcx
; X64-SSE2-NEXT: movq 16(%rdi), %r8
; X64-SSE2-NEXT: movq 24(%rdi), %rdi
; X64-SSE2-NEXT: movzbl (%rsi), %esi
; X64-SSE2-NEXT: xorps %xmm0, %xmm0
; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-SSE2-NEXT: shlb $3, %sil
; X64-SSE2-NEXT: andb $24, %sil
; X64-SSE2-NEXT: negb %sil
; X64-SSE2-NEXT: movsbq %sil, %rax
; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rcx
; X64-SSE2-NEXT: movq -32(%rsp,%rax), %rsi
; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rdi
; X64-SSE2-NEXT: movq -24(%rsp,%rax), %rax
; X64-SSE2-NEXT: movq %rax, 16(%rdx)
; X64-SSE2-NEXT: movq %rdi, 24(%rdx)
; X64-SSE2-NEXT: movq %rcx, (%rdx)
; X64-SSE2-NEXT: movq %rsi, 8(%rdx)
; X64-SSE2-NEXT: retq
;
; X64-SSE42-LABEL: shl_32bytes_qwordOff:
; X64-SSE42: # %bb.0:
; X64-SSE42-NEXT: movups (%rdi), %xmm0
; X64-SSE42-NEXT: movups 16(%rdi), %xmm1
; X64-SSE42-NEXT: movzbl (%rsi), %eax
; X64-SSE42-NEXT: xorps %xmm2, %xmm2
; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE42-NEXT: shlb $3, %al
; X64-SSE42-NEXT: andb $24, %al
; X64-SSE42-NEXT: negb %al
; X64-SSE42-NEXT: movsbq %al, %rax
; X64-SSE42-NEXT: movups -40(%rsp,%rax), %xmm0
; X64-SSE42-NEXT: movups -24(%rsp,%rax), %xmm1
; X64-SSE42-NEXT: movups %xmm1, 16(%rdx)
; X64-SSE42-NEXT: movups %xmm0, (%rdx)
; X64-SSE42-NEXT: retq
;
; X64-AVX-LABEL: shl_32bytes_qwordOff:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovups (%rdi), %ymm0
; X64-AVX-NEXT: movzbl (%rsi), %eax
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: shlb $3, %al
; X64-AVX-NEXT: andb $24, %al
; X64-AVX-NEXT: negb %al
; X64-AVX-NEXT: movsbq %al, %rax
; X64-AVX-NEXT: vmovups -40(%rsp,%rax), %xmm0
; X64-AVX-NEXT: vmovups -24(%rsp,%rax), %xmm1
; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
; X86-SSE2-LABEL: shl_32bytes_qwordOff:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: pushl %ebp
; X86-SSE2-NEXT: pushl %ebx
; X86-SSE2-NEXT: pushl %edi
; X86-SSE2-NEXT: pushl %esi
; X86-SSE2-NEXT: subl $92, %esp
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-SSE2-NEXT: movl (%ebp), %eax
; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 4(%ebp), %eax
; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 8(%ebp), %esi
; X86-SSE2-NEXT: movl 12(%ebp), %edi
; X86-SSE2-NEXT: movl 16(%ebp), %ebx
; X86-SSE2-NEXT: movzbl (%ecx), %ecx
; X86-SSE2-NEXT: movl 20(%ebp), %edx
; X86-SSE2-NEXT: movl 24(%ebp), %eax
; X86-SSE2-NEXT: movl 28(%ebp), %ebp
; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: xorps %xmm0, %xmm0
; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT: shlb $3, %cl
; X86-SSE2-NEXT: andb $24, %cl
; X86-SSE2-NEXT: negb %cl
; X86-SSE2-NEXT: movsbl %cl, %edx
; X86-SSE2-NEXT: movl 48(%esp,%edx), %eax
; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 52(%esp,%edx), %eax
; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SSE2-NEXT: movl 60(%esp,%edx), %esi
; X86-SSE2-NEXT: movl 56(%esp,%edx), %edi
; X86-SSE2-NEXT: movl 68(%esp,%edx), %ebx
; X86-SSE2-NEXT: movl 64(%esp,%edx), %ebp
; X86-SSE2-NEXT: movl 76(%esp,%edx), %ecx
; X86-SSE2-NEXT: movl 72(%esp,%edx), %edx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movl %edx, 24(%eax)
; X86-SSE2-NEXT: movl %ecx, 28(%eax)
; X86-SSE2-NEXT: movl %ebp, 16(%eax)
; X86-SSE2-NEXT: movl %ebx, 20(%eax)
; X86-SSE2-NEXT: movl %edi, 8(%eax)
; X86-SSE2-NEXT: movl %esi, 12(%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, (%eax)
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-SSE2-NEXT: movl %ecx, 4(%eax)
; X86-SSE2-NEXT: addl $92, %esp
; X86-SSE2-NEXT: popl %esi
; X86-SSE2-NEXT: popl %edi
; X86-SSE2-NEXT: popl %ebx
; X86-SSE2-NEXT: popl %ebp
; X86-SSE2-NEXT: retl
;
; X86-SSE42-LABEL: shl_32bytes_qwordOff:
; X86-SSE42: # %bb.0:
; X86-SSE42-NEXT: subl $76, %esp
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-SSE42-NEXT: movups (%edx), %xmm0
; X86-SSE42-NEXT: movups 16(%edx), %xmm1
; X86-SSE42-NEXT: movzbl (%ecx), %ecx
; X86-SSE42-NEXT: xorps %xmm2, %xmm2
; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movaps %xmm2, (%esp)
; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-SSE42-NEXT: shlb $3, %cl
; X86-SSE42-NEXT: andb $24, %cl
; X86-SSE42-NEXT: negb %cl
; X86-SSE42-NEXT: movsbl %cl, %ecx
; X86-SSE42-NEXT: movups 32(%esp,%ecx), %xmm0
; X86-SSE42-NEXT: movups 48(%esp,%ecx), %xmm1
; X86-SSE42-NEXT: movups %xmm1, 16(%eax)
; X86-SSE42-NEXT: movups %xmm0, (%eax)
; X86-SSE42-NEXT: addl $76, %esp
; X86-SSE42-NEXT: retl
;
; X86-AVX-LABEL: shl_32bytes_qwordOff:
; X86-AVX: # %bb.0:
; X86-AVX-NEXT: subl $76, %esp
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT: vmovups (%edx), %ymm0
; X86-AVX-NEXT: movzbl (%ecx), %ecx
; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT: vmovups %ymm1, (%esp)
; X86-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
; X86-AVX-NEXT: shlb $3, %cl
; X86-AVX-NEXT: andb $24, %cl
; X86-AVX-NEXT: negb %cl
; X86-AVX-NEXT: movsbl %cl, %ecx
; X86-AVX-NEXT: vmovups 32(%esp,%ecx), %xmm0
; X86-AVX-NEXT: vmovups 48(%esp,%ecx), %xmm1
; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
; X86-AVX-NEXT: vmovups %xmm0, (%eax)
; X86-AVX-NEXT: addl $76, %esp
; X86-AVX-NEXT: vzeroupper
; X86-AVX-NEXT: retl
  %src = load i256, ptr %src.ptr, align 1
  %qwordOff = load i256, ptr %qwordOff.ptr, align 1
  ; One qword is 64 bits, so shifting the qword count left by 6 gives the
  ; shift amount in bits.
  %bitOff = shl i256 %qwordOff, 6
  %res = shl i256 %src, %bitOff
  store i256 %res, ptr %dst, align 1
  ret void
}

define void @ashr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: ashr_32bytes: ; FALLBACK0: # %bb.0: ; FALLBACK0-NEXT: pushq %rbx ; FALLBACK0-NEXT: movq (%rdi), %rcx ; FALLBACK0-NEXT: movq 8(%rdi), %r8 ; FALLBACK0-NEXT: movq 16(%rdi), %r9 ; FALLBACK0-NEXT: movq 24(%rdi), %rdi ; FALLBACK0-NEXT: movzbl
(%rsi), %esi ; FALLBACK0-NEXT: leal (,%rsi,8), %eax ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: sarq $63, %rdi ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: andb $24, %sil ; FALLBACK0-NEXT: movzbl %sil, %r9d ; FALLBACK0-NEXT: movq -64(%rsp,%r9), %r10 ; FALLBACK0-NEXT: movq -56(%rsp,%r9), %rdi ; FALLBACK0-NEXT: movq %rdi, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq -48(%rsp,%r9), %rbx ; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 ; FALLBACK0-NEXT: orq %r11, %r8 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r10 ; FALLBACK0-NEXT: addq %rdi, %rdi ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %rdi ; FALLBACK0-NEXT: orq %r10, %rdi ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx ; FALLBACK0-NEXT: movq -40(%rsp,%r9), %r9 ; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: orq %rbx, %r10 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: sarq %cl, %r9 ; FALLBACK0-NEXT: movq %r9, 24(%rdx) ; FALLBACK0-NEXT: movq %r10, 16(%rdx) ; FALLBACK0-NEXT: movq %rdi, (%rdx) ; FALLBACK0-NEXT: movq %r8, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: ashr_32bytes: ; FALLBACK1: # %bb.0: ; FALLBACK1-NEXT: movq (%rdi), %rax ; FALLBACK1-NEXT: movq 8(%rdi), %r8 ; FALLBACK1-NEXT: movq 16(%rdi), %r9 ; FALLBACK1-NEXT: movq 24(%rdi), %rdi ; FALLBACK1-NEXT: movzbl (%rsi), %esi ; FALLBACK1-NEXT: leal (,%rsi,8), %ecx ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) 
; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: sarq $63, %rdi ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: andb $24, %sil ; FALLBACK1-NEXT: movzbl %sil, %eax ; FALLBACK1-NEXT: movq -56(%rsp,%rax), %rsi ; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rdi ; FALLBACK1-NEXT: movq -64(%rsp,%rax), %r8 ; FALLBACK1-NEXT: movq %r8, %r9 ; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 ; FALLBACK1-NEXT: movq -48(%rsp,%rax), %rax ; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi ; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK1-NEXT: sarq %cl, %rax ; FALLBACK1-NEXT: movq %rsi, 16(%rdx) ; FALLBACK1-NEXT: movq %rax, 24(%rdx) ; FALLBACK1-NEXT: movq %rdi, (%rdx) ; FALLBACK1-NEXT: movq %r9, 8(%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: ashr_32bytes: ; FALLBACK2: # %bb.0: ; FALLBACK2-NEXT: movq (%rdi), %rcx ; FALLBACK2-NEXT: movq 8(%rdi), %r8 ; FALLBACK2-NEXT: movq 16(%rdi), %r9 ; FALLBACK2-NEXT: movq 24(%rdi), %rdi ; FALLBACK2-NEXT: movzbl (%rsi), %esi ; FALLBACK2-NEXT: leal (,%rsi,8), %eax ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: sarq $63, %rdi ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: andb $24, %sil ; FALLBACK2-NEXT: movzbl %sil, %ecx ; FALLBACK2-NEXT: movq -64(%rsp,%rcx), %rsi ; FALLBACK2-NEXT: movq -56(%rsp,%rcx), %rdi ; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 ; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx), %r9 ; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 ; 
FALLBACK2-NEXT: movq -48(%rsp,%rcx), %rcx ; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK2-NEXT: orq %r8, %rdi ; FALLBACK2-NEXT: addq %rsi, %rsi ; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi ; FALLBACK2-NEXT: orq %r9, %rsi ; FALLBACK2-NEXT: addq %rcx, %rcx ; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax ; FALLBACK2-NEXT: orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) ; FALLBACK2-NEXT: movq %rsi, (%rdx) ; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes: ; FALLBACK3: # %bb.0: ; FALLBACK3-NEXT: movq (%rdi), %rax ; FALLBACK3-NEXT: movq 8(%rdi), %r8 ; FALLBACK3-NEXT: movq 16(%rdi), %r9 ; FALLBACK3-NEXT: movq 24(%rdi), %rdi ; FALLBACK3-NEXT: movzbl (%rsi), %esi ; FALLBACK3-NEXT: leal (,%rsi,8), %ecx ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: sarq $63, %rdi ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: andb $24, %sil ; FALLBACK3-NEXT: movzbl %sil, %eax ; FALLBACK3-NEXT: movq -56(%rsp,%rax), %rsi ; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rdi ; FALLBACK3-NEXT: movq -64(%rsp,%rax), %r8 ; FALLBACK3-NEXT: movq %r8, %r9 ; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 ; FALLBACK3-NEXT: movq -48(%rsp,%rax), %rax ; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi ; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi ; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax ; FALLBACK3-NEXT: movq %rsi, 16(%rdx) ; FALLBACK3-NEXT: movq %rax, 24(%rdx) ; FALLBACK3-NEXT: movq %rdi, (%rdx) ; FALLBACK3-NEXT: movq %r9, 8(%rdx) ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: ashr_32bytes: 
; FALLBACK4: # %bb.0: ; FALLBACK4-NEXT: pushq %rbx ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movq 16(%rdi), %rcx ; FALLBACK4-NEXT: movq 24(%rdi), %rdi ; FALLBACK4-NEXT: movzbl (%rsi), %esi ; FALLBACK4-NEXT: leal (,%rsi,8), %eax ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: sarq $63, %rdi ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: andb $24, %sil ; FALLBACK4-NEXT: movzbl %sil, %r9d ; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r10 ; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r8 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rdi ; FALLBACK4-NEXT: orq %r10, %rdi ; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r10 ; FALLBACK4-NEXT: movq %r10, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r11 ; FALLBACK4-NEXT: movq -40(%rsp,%r9), %r9 ; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rbx ; FALLBACK4-NEXT: orq %r11, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 ; FALLBACK4-NEXT: addq %r10, %r10 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: orq %r8, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: sarq %cl, %r9 ; FALLBACK4-NEXT: movq %r9, 24(%rdx) ; FALLBACK4-NEXT: movq %r10, 8(%rdx) ; FALLBACK4-NEXT: movq %rbx, 16(%rdx) ; FALLBACK4-NEXT: movq %rdi, (%rdx) ; FALLBACK4-NEXT: popq %rbx ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: ashr_32bytes: ; FALLBACK5: # %bb.0: ; FALLBACK5-NEXT: movups (%rdi), %xmm0 ; FALLBACK5-NEXT: movq 16(%rdi), %rax ; FALLBACK5-NEXT: movq 24(%rdi), %rdi ; 
FALLBACK5-NEXT: movzbl (%rsi), %esi ; FALLBACK5-NEXT: leal (,%rsi,8), %ecx ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: sarq $63, %rdi ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: andb $24, %sil ; FALLBACK5-NEXT: movzbl %sil, %eax ; FALLBACK5-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK5-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK5-NEXT: movq %rdi, %r8 ; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK5-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK5-NEXT: movq %rax, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK5-NEXT: sarq %cl, %rsi ; FALLBACK5-NEXT: movq %r10, 8(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) ; FALLBACK5-NEXT: movq %r9, (%rdx) ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: ashr_32bytes: ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movq 16(%rdi), %rcx ; FALLBACK6-NEXT: movq 24(%rdi), %rdi ; FALLBACK6-NEXT: movzbl (%rsi), %esi ; FALLBACK6-NEXT: leal (,%rsi,8), %eax ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: sarq $63, %rdi ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: andb $24, %sil ; FALLBACK6-NEXT: movzbl %sil, %ecx ; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi ; FALLBACK6-NEXT: movq -64(%rsp,%rcx), %rdi ; FALLBACK6-NEXT: movq -56(%rsp,%rcx), %r8 ; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 ; 
FALLBACK6-NEXT: movq -48(%rsp,%rcx), %rcx ; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 ; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK6-NEXT: notb %al ; FALLBACK6-NEXT: addq %rdi, %rdi ; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK6-NEXT: orq %rsi, %rdi ; FALLBACK6-NEXT: addq %rcx, %rcx ; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK6-NEXT: orq %r9, %rcx ; FALLBACK6-NEXT: addq %r8, %r8 ; FALLBACK6-NEXT: shlxq %rax, %r8, %rax ; FALLBACK6-NEXT: orq %r10, %rax ; FALLBACK6-NEXT: movq %r11, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) ; FALLBACK6-NEXT: movq %rcx, 16(%rdx) ; FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes: ; FALLBACK7: # %bb.0: ; FALLBACK7-NEXT: movups (%rdi), %xmm0 ; FALLBACK7-NEXT: movq 16(%rdi), %rax ; FALLBACK7-NEXT: movq 24(%rdi), %rdi ; FALLBACK7-NEXT: movzbl (%rsi), %esi ; FALLBACK7-NEXT: leal (,%rsi,8), %ecx ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: sarq $63, %rdi ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: andb $24, %sil ; FALLBACK7-NEXT: movzbl %sil, %eax ; FALLBACK7-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK7-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK7-NEXT: movq %rdi, %r8 ; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK7-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK7-NEXT: movq %rax, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax ; FALLBACK7-NEXT: movq %r10, 8(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) ; FALLBACK7-NEXT: movq %rax, 24(%rdx) ; FALLBACK7-NEXT: movq %r9, (%rdx) ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: ashr_32bytes: ; 
FALLBACK8: # %bb.0: ; FALLBACK8-NEXT: pushq %rbx ; FALLBACK8-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK8-NEXT: movq 16(%rdi), %rcx ; FALLBACK8-NEXT: movq 24(%rdi), %rdi ; FALLBACK8-NEXT: movzbl (%rsi), %esi ; FALLBACK8-NEXT: leal (,%rsi,8), %eax ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: sarq $63, %rdi ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: andb $24, %sil ; FALLBACK8-NEXT: movzbl %sil, %r9d ; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r10 ; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r8 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rdi ; FALLBACK8-NEXT: orq %r10, %rdi ; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r10 ; FALLBACK8-NEXT: movq %r10, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r11 ; FALLBACK8-NEXT: movq -40(%rsp,%r9), %r9 ; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rbx ; FALLBACK8-NEXT: orq %r11, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r8 ; FALLBACK8-NEXT: addq %r10, %r10 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: orq %r8, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: sarq %cl, %r9 ; FALLBACK8-NEXT: movq %r9, 24(%rdx) ; FALLBACK8-NEXT: movq %r10, 8(%rdx) ; FALLBACK8-NEXT: movq %rbx, 16(%rdx) ; FALLBACK8-NEXT: movq %rdi, (%rdx) ; FALLBACK8-NEXT: popq %rbx ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: ashr_32bytes: ; FALLBACK9: # %bb.0: ; FALLBACK9-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK9-NEXT: movq 16(%rdi), %rax ; FALLBACK9-NEXT: movq 24(%rdi), %rdi ; 
FALLBACK9-NEXT: movzbl (%rsi), %esi ; FALLBACK9-NEXT: leal (,%rsi,8), %ecx ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: sarq $63, %rdi ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: andb $24, %sil ; FALLBACK9-NEXT: movzbl %sil, %eax ; FALLBACK9-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK9-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK9-NEXT: movq %rdi, %r8 ; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK9-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK9-NEXT: movq %rax, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: sarq %cl, %rsi ; FALLBACK9-NEXT: movq %r10, 8(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) ; FALLBACK9-NEXT: movq %r9, (%rdx) ; FALLBACK9-NEXT: retq ; ; FALLBACK10-LABEL: ashr_32bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK10-NEXT: movq 16(%rdi), %rcx ; FALLBACK10-NEXT: movq 24(%rdi), %rdi ; FALLBACK10-NEXT: movzbl (%rsi), %esi ; FALLBACK10-NEXT: leal (,%rsi,8), %eax ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: sarq $63, %rdi ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: andb $24, %sil ; FALLBACK10-NEXT: movzbl %sil, %ecx ; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi ; FALLBACK10-NEXT: movq -64(%rsp,%rcx), %rdi ; FALLBACK10-NEXT: movq -56(%rsp,%rcx), %r8 ; FALLBACK10-NEXT: shrxq 
%rax, %r8, %r9 ; FALLBACK10-NEXT: movq -48(%rsp,%rcx), %rcx ; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 ; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK10-NEXT: notb %al ; FALLBACK10-NEXT: addq %rdi, %rdi ; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK10-NEXT: orq %rsi, %rdi ; FALLBACK10-NEXT: addq %rcx, %rcx ; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK10-NEXT: orq %r9, %rcx ; FALLBACK10-NEXT: addq %r8, %r8 ; FALLBACK10-NEXT: shlxq %rax, %r8, %rax ; FALLBACK10-NEXT: orq %r10, %rax ; FALLBACK10-NEXT: movq %r11, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) ; FALLBACK10-NEXT: movq %rcx, 16(%rdx) ; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes: ; FALLBACK11: # %bb.0: ; FALLBACK11-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK11-NEXT: movq 16(%rdi), %rax ; FALLBACK11-NEXT: movq 24(%rdi), %rdi ; FALLBACK11-NEXT: movzbl (%rsi), %esi ; FALLBACK11-NEXT: leal (,%rsi,8), %ecx ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: sarq $63, %rdi ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: andb $24, %sil ; FALLBACK11-NEXT: movzbl %sil, %eax ; FALLBACK11-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK11-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK11-NEXT: movq %rdi, %r8 ; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK11-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK11-NEXT: movq %rax, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax ; FALLBACK11-NEXT: movq %r10, 8(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) ; FALLBACK11-NEXT: movq %rax, 24(%rdx) ; FALLBACK11-NEXT: movq 
%r9, (%rdx) ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: ashr_32bytes: ; FALLBACK12: # %bb.0: ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK12-NEXT: movq 16(%rdi), %rcx ; FALLBACK12-NEXT: movq 24(%rdi), %rdi ; FALLBACK12-NEXT: movzbl (%rsi), %esi ; FALLBACK12-NEXT: leal (,%rsi,8), %eax ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: sarq $63, %rdi ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: andb $24, %sil ; FALLBACK12-NEXT: movzbl %sil, %r9d ; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r10 ; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi ; FALLBACK12-NEXT: orq %r10, %rdi ; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r10 ; FALLBACK12-NEXT: movq %r10, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r11 ; FALLBACK12-NEXT: movq -40(%rsp,%r9), %r9 ; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r11, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r8 ; FALLBACK12-NEXT: addq %r10, %r10 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: orq %r8, %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: sarq %cl, %r9 ; FALLBACK12-NEXT: movq %r9, 24(%rdx) ; FALLBACK12-NEXT: movq %r10, 8(%rdx) ; FALLBACK12-NEXT: movq %rbx, 16(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: popq %rbx ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: ashr_32bytes: ; FALLBACK13: # 
%bb.0: ; FALLBACK13-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK13-NEXT: movq 16(%rdi), %rax ; FALLBACK13-NEXT: movq 24(%rdi), %rdi ; FALLBACK13-NEXT: movzbl (%rsi), %esi ; FALLBACK13-NEXT: leal (,%rsi,8), %ecx ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: sarq $63, %rdi ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: andb $24, %sil ; FALLBACK13-NEXT: movzbl %sil, %eax ; FALLBACK13-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK13-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK13-NEXT: movq %rdi, %r8 ; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK13-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK13-NEXT: movq %rax, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK13-NEXT: sarq %cl, %rsi ; FALLBACK13-NEXT: movq %r10, 8(%rdx) ; FALLBACK13-NEXT: movq %r8, 16(%rdx) ; FALLBACK13-NEXT: movq %rsi, 24(%rdx) ; FALLBACK13-NEXT: movq %r9, (%rdx) ; FALLBACK13-NEXT: retq ; ; FALLBACK14-LABEL: ashr_32bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK14-NEXT: movq 16(%rdi), %rcx ; FALLBACK14-NEXT: movq 24(%rdi), %rdi ; FALLBACK14-NEXT: movzbl (%rsi), %esi ; FALLBACK14-NEXT: leal (,%rsi,8), %eax ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: sarq $63, %rdi ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: andb $24, %sil ; FALLBACK14-NEXT: movzbl %sil, %ecx ; 
FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx), %rsi ; FALLBACK14-NEXT: movq -64(%rsp,%rcx), %rdi ; FALLBACK14-NEXT: movq -56(%rsp,%rcx), %r8 ; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK14-NEXT: movq -48(%rsp,%rcx), %rcx ; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 ; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK14-NEXT: notb %al ; FALLBACK14-NEXT: addq %rdi, %rdi ; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK14-NEXT: orq %rsi, %rdi ; FALLBACK14-NEXT: addq %rcx, %rcx ; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK14-NEXT: orq %r9, %rcx ; FALLBACK14-NEXT: addq %r8, %r8 ; FALLBACK14-NEXT: shlxq %rax, %r8, %rax ; FALLBACK14-NEXT: orq %r10, %rax ; FALLBACK14-NEXT: movq %r11, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) ; FALLBACK14-NEXT: movq %rcx, 16(%rdx) ; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes: ; FALLBACK15: # %bb.0: ; FALLBACK15-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK15-NEXT: movq 16(%rdi), %rax ; FALLBACK15-NEXT: movq 24(%rdi), %rdi ; FALLBACK15-NEXT: movzbl (%rsi), %esi ; FALLBACK15-NEXT: leal (,%rsi,8), %ecx ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: sarq $63, %rdi ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: andb $24, %sil ; FALLBACK15-NEXT: movzbl %sil, %eax ; FALLBACK15-NEXT: movq -48(%rsp,%rax), %rsi ; FALLBACK15-NEXT: movq -56(%rsp,%rax), %rdi ; FALLBACK15-NEXT: movq %rdi, %r8 ; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r9 ; FALLBACK15-NEXT: movq -64(%rsp,%rax), %rax ; FALLBACK15-NEXT: movq %rax, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK15-NEXT: 
sarxq %rcx, %rsi, %rax ; FALLBACK15-NEXT: movq %r10, 8(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) ; FALLBACK15-NEXT: movq %rax, 24(%rdx) ; FALLBACK15-NEXT: movq %r9, (%rdx) ; FALLBACK15-NEXT: retq ; ; FALLBACK16-LABEL: ashr_32bytes: ; FALLBACK16: # %bb.0: ; FALLBACK16-NEXT: pushl %ebp ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi ; FALLBACK16-NEXT: subl $108, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK16-NEXT: movl (%esi), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 4(%esi), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 8(%esi), %ebx ; FALLBACK16-NEXT: movl 12(%esi), %ebp ; FALLBACK16-NEXT: movl 16(%esi), %edi ; FALLBACK16-NEXT: movzbl (%eax), %ecx ; FALLBACK16-NEXT: movl 20(%esi), %edx ; FALLBACK16-NEXT: movl 24(%esi), %eax ; FALLBACK16-NEXT: movl 28(%esi), %esi ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, %edx ; FALLBACK16-NEXT: shlb $3, %dl ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: sarl $31, %esi ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; 
FALLBACK16-NEXT: andb $28, %cl ; FALLBACK16-NEXT: movzbl %cl, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 32(%esp,%edi), %esi ; FALLBACK16-NEXT: movl 36(%esp,%edi), %eax ; FALLBACK16-NEXT: movl %eax, %ebx ; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: movb %dl, %ch ; FALLBACK16-NEXT: notb %ch ; FALLBACK16-NEXT: movl 40(%esp,%edi), %edi ; FALLBACK16-NEXT: leal (%edi,%edi), %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %ebx, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: addl %eax, %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: orl %esi, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl 44(%esp,%eax), %ebp ; FALLBACK16-NEXT: movl %ebp, %esi ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: movl %edx, %ebx ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: movl 48(%esp,%eax), %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: leal (%edx,%edx), %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: orl %esi, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl %ebx, %edx ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %edi, %ebp ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK16-NEXT: movl 52(%esp,%esi), %edi ; FALLBACK16-NEXT: movl %edi, %eax ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl 56(%esp,%esi), %ebx ; 
FALLBACK16-NEXT: leal (%ebx,%ebx), %esi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: orl %eax, %esi ; FALLBACK16-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: addl %edi, %edi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: orl %eax, %edi ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl 60(%esp,%eax), %eax ; FALLBACK16-NEXT: leal (%eax,%eax), %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: orl %ebx, %edx ; FALLBACK16-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; FALLBACK16-NEXT: sarl %cl, %eax ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK16-NEXT: movl %eax, 28(%ecx) ; FALLBACK16-NEXT: movl %edx, 24(%ecx) ; FALLBACK16-NEXT: movl %edi, 16(%ecx) ; FALLBACK16-NEXT: movl %esi, 20(%ecx) ; FALLBACK16-NEXT: movl %ebp, 8(%ecx) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, 12(%ecx) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, (%ecx) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, 4(%ecx) ; FALLBACK16-NEXT: addl $108, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx ; FALLBACK16-NEXT: popl %ebp ; FALLBACK16-NEXT: retl ; ; FALLBACK17-LABEL: ashr_32bytes: ; FALLBACK17: # %bb.0: ; FALLBACK17-NEXT: pushl %ebp ; FALLBACK17-NEXT: pushl %ebx ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi ; FALLBACK17-NEXT: subl $92, %esp ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl 
(%ecx), %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 4(%ecx), %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 8(%ecx), %edx ; FALLBACK17-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: movl 12(%ecx), %ebp ; FALLBACK17-NEXT: movl 16(%ecx), %ebx ; FALLBACK17-NEXT: movzbl (%eax), %eax ; FALLBACK17-NEXT: movl 20(%ecx), %edi ; FALLBACK17-NEXT: movl 24(%ecx), %edx ; FALLBACK17-NEXT: movl 28(%ecx), %esi ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, %ecx ; FALLBACK17-NEXT: shlb $3, %cl ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: sarl $31, %esi ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: andb $28, %al ; FALLBACK17-NEXT: movzbl %al, %ebp ; FALLBACK17-NEXT: movl 24(%esp,%ebp), %edx ; FALLBACK17-NEXT: movl 20(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %edx, %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 32(%esp,%ebp), %ebx ; FALLBACK17-NEXT: movl 28(%esp,%ebp), %eax ; 
FALLBACK17-NEXT: movl %eax, %esi ; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi ; FALLBACK17-NEXT: movl %esi, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 40(%esp,%ebp), %edx ; FALLBACK17-NEXT: movl 36(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %edi ; FALLBACK17-NEXT: shrdl %cl, %edx, %edi ; FALLBACK17-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK17-NEXT: movl 16(%esp,%ebp), %esi ; FALLBACK17-NEXT: movl 44(%esp,%ebp), %eax ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK17-NEXT: movl %edx, 24(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: shrdl %cl, %edx, %esi ; FALLBACK17-NEXT: sarl %cl, %eax ; FALLBACK17-NEXT: movl %eax, 28(%ebp) ; FALLBACK17-NEXT: movl %ebx, 16(%ebp) ; FALLBACK17-NEXT: movl %edi, 20(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 8(%ebp) ; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 12(%ebp) ; FALLBACK17-NEXT: movl %esi, (%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 4(%ebp) ; FALLBACK17-NEXT: addl $92, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi ; FALLBACK17-NEXT: popl %ebx ; FALLBACK17-NEXT: popl %ebp ; FALLBACK17-NEXT: retl ; ; FALLBACK18-LABEL: ashr_32bytes: ; FALLBACK18: # %bb.0: ; FALLBACK18-NEXT: pushl %ebp ; FALLBACK18-NEXT: pushl %ebx ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $108, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %esi ; FALLBACK18-NEXT: movl (%esi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 4(%esi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 8(%esi), 
%ebx ; FALLBACK18-NEXT: movl 12(%esi), %ebp ; FALLBACK18-NEXT: movl 16(%esi), %edi ; FALLBACK18-NEXT: movzbl (%ecx), %ecx ; FALLBACK18-NEXT: movl 20(%esi), %edx ; FALLBACK18-NEXT: movl 24(%esi), %eax ; FALLBACK18-NEXT: movl 28(%esi), %esi ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, %eax ; FALLBACK18-NEXT: shlb $3, %al ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: sarl $31, %esi ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: andb $28, %cl ; FALLBACK18-NEXT: movzbl %cl, %edi ; FALLBACK18-NEXT: movl 36(%esp,%edi), %esi ; FALLBACK18-NEXT: movl 40(%esp,%edi), %ecx ; FALLBACK18-NEXT: shrxl %eax, %esi, %ebx ; FALLBACK18-NEXT: movl %eax, %edx ; FALLBACK18-NEXT: notb %dl ; FALLBACK18-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK18-NEXT: shlxl %edx, %ebp, %ebp ; FALLBACK18-NEXT: orl %ebx, %ebp ; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %eax, 32(%esp,%edi), %ebx ; FALLBACK18-NEXT: addl %esi, %esi ; FALLBACK18-NEXT: shlxl %edx, %esi, %esi ; FALLBACK18-NEXT: orl %ebx, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 48(%esp,%edi), %esi ; FALLBACK18-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%esi,%esi), %ebx ; FALLBACK18-NEXT: shlxl %edx, %ebx, %esi ; FALLBACK18-NEXT: movl 44(%esp,%edi), %ebp ; FALLBACK18-NEXT: shrxl %eax, %ebp, %ebx ; FALLBACK18-NEXT: orl %ebx, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %eax, %ecx, %ecx ; FALLBACK18-NEXT: movl %eax, %ebx ; FALLBACK18-NEXT: addl %ebp, %ebp ; FALLBACK18-NEXT: shlxl %edx, %ebp, %eax ; FALLBACK18-NEXT: orl %ecx, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 56(%esp,%edi), %ebp ; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx ; FALLBACK18-NEXT: shlxl %edx, %ecx, %ecx ; FALLBACK18-NEXT: movl 52(%esp,%edi), %eax ; FALLBACK18-NEXT: shrxl %ebx, %eax, %esi ; FALLBACK18-NEXT: orl %esi, %ecx ; FALLBACK18-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: addl %eax, %eax ; FALLBACK18-NEXT: shlxl %edx, %eax, %esi ; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK18-NEXT: shrxl %ebx, %ebp, %eax ; FALLBACK18-NEXT: movl 60(%esp,%edi), %edi ; FALLBACK18-NEXT: sarxl %ebx, %edi, %ebx ; FALLBACK18-NEXT: addl %edi, %edi ; FALLBACK18-NEXT: shlxl %edx, %edi, %edx ; FALLBACK18-NEXT: orl %eax, %edx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl %ebx, 28(%eax) ; FALLBACK18-NEXT: movl %edx, 24(%eax) ; FALLBACK18-NEXT: movl %esi, 16(%eax) ; FALLBACK18-NEXT: movl %ecx, 20(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 8(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 12(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, (%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 
4(%eax) ; FALLBACK18-NEXT: addl $108, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi ; FALLBACK18-NEXT: popl %ebx ; FALLBACK18-NEXT: popl %ebp ; FALLBACK18-NEXT: retl ; ; FALLBACK19-LABEL: ashr_32bytes: ; FALLBACK19: # %bb.0: ; FALLBACK19-NEXT: pushl %ebp ; FALLBACK19-NEXT: pushl %ebx ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $92, %esp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK19-NEXT: movl (%ecx), %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 4(%ecx), %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 8(%ecx), %edx ; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK19-NEXT: movl 12(%ecx), %ebp ; FALLBACK19-NEXT: movl 16(%ecx), %ebx ; FALLBACK19-NEXT: movzbl (%eax), %eax ; FALLBACK19-NEXT: movl 20(%ecx), %edi ; FALLBACK19-NEXT: movl 24(%ecx), %edx ; FALLBACK19-NEXT: movl 28(%ecx), %esi ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, %ecx ; FALLBACK19-NEXT: shlb $3, %cl ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: sarl $31, %esi ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; 
FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: andb $28, %al ; FALLBACK19-NEXT: movzbl %al, %ebp ; FALLBACK19-NEXT: movl 24(%esp,%ebp), %esi ; FALLBACK19-NEXT: movl 20(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %esi, %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 32(%esp,%ebp), %ebx ; FALLBACK19-NEXT: movl 28(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK19-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %esi ; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 40(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl 36(%esp,%ebp), %edx ; FALLBACK19-NEXT: movl %edx, %esi ; FALLBACK19-NEXT: shrdl %cl, %eax, %esi ; FALLBACK19-NEXT: shrdl %cl, %edx, %ebx ; FALLBACK19-NEXT: movl 16(%esp,%ebp), %edx ; FALLBACK19-NEXT: movl 44(%esp,%ebp), %edi ; FALLBACK19-NEXT: shrdl %cl, %edi, %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK19-NEXT: movl %eax, 24(%ebp) ; FALLBACK19-NEXT: sarxl %ecx, %edi, %eax ; FALLBACK19-NEXT: movl %eax, 28(%ebp) ; FALLBACK19-NEXT: movl %ebx, 16(%ebp) ; FALLBACK19-NEXT: movl %esi, 20(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 8(%ebp) ; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 12(%ebp) ; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: shrdl %cl, %eax, %edx ; FALLBACK19-NEXT: movl %edx, (%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 4(%ebp) ; FALLBACK19-NEXT: addl $92, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi ; FALLBACK19-NEXT: popl %ebx ; FALLBACK19-NEXT: popl 
%ebp ; FALLBACK19-NEXT: retl ; ; FALLBACK20-LABEL: ashr_32bytes: ; FALLBACK20: # %bb.0: ; FALLBACK20-NEXT: pushl %ebp ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; FALLBACK20-NEXT: pushl %esi ; FALLBACK20-NEXT: subl $108, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 ; FALLBACK20-NEXT: movl 16(%ecx), %esi ; FALLBACK20-NEXT: movl 20(%ecx), %edi ; FALLBACK20-NEXT: movl 24(%ecx), %ebx ; FALLBACK20-NEXT: movl 28(%ecx), %edx ; FALLBACK20-NEXT: movzbl (%eax), %eax ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shlb $3, %cl ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: sarl $31, %edx ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: andb $28, %al ; FALLBACK20-NEXT: movzbl %al, %edi ; FALLBACK20-NEXT: movl 32(%esp,%edi), %eax ; FALLBACK20-NEXT: movl 36(%esp,%edi), %esi ; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: movl %ecx, %edx ; FALLBACK20-NEXT: movb %cl, %dh ; FALLBACK20-NEXT: notb %dl ; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: orl %eax, %esi ; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 44(%esp,%edi), %ebx ; FALLBACK20-NEXT: movl %ebx, %eax ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: shrl %cl, %eax ; 
FALLBACK20-NEXT: movl 48(%esp,%edi), %esi ; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: orl %eax, %esi ; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 40(%esp,%edi), %esi ; FALLBACK20-NEXT: movl %esi, %eax ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: addl %ebx, %ebx ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %eax, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 52(%esp,%edi), %ebp ; FALLBACK20-NEXT: movl %ebp, %eax ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: movl 56(%esp,%edi), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %eax, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: addl %ebp, %ebp ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebp ; FALLBACK20-NEXT: orl %eax, %ebp ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: movl 60(%esp,%edi), %eax ; FALLBACK20-NEXT: leal (%eax,%eax), %edi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %edi ; FALLBACK20-NEXT: orl %ebx, %edi ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %esi ; 
FALLBACK20-NEXT: orl %ebx, %esi ; FALLBACK20-NEXT: movb %dh, %cl ; FALLBACK20-NEXT: sarl %cl, %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movl %eax, 28(%ecx) ; FALLBACK20-NEXT: movl %esi, 4(%ecx) ; FALLBACK20-NEXT: movl %edi, 24(%ecx) ; FALLBACK20-NEXT: movl %ebp, 16(%ecx) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: movl %eax, 20(%ecx) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: movl %eax, 8(%ecx) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: movl %eax, 12(%ecx) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: movl %eax, (%ecx) ; FALLBACK20-NEXT: addl $108, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx ; FALLBACK20-NEXT: popl %ebp ; FALLBACK20-NEXT: retl ; ; FALLBACK21-LABEL: ashr_32bytes: ; FALLBACK21: # %bb.0: ; FALLBACK21-NEXT: pushl %ebp ; FALLBACK21-NEXT: pushl %ebx ; FALLBACK21-NEXT: pushl %edi ; FALLBACK21-NEXT: pushl %esi ; FALLBACK21-NEXT: subl $108, %esp ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK21-NEXT: movups (%ecx), %xmm0 ; FALLBACK21-NEXT: movl 16(%ecx), %esi ; FALLBACK21-NEXT: movl 20(%ecx), %edi ; FALLBACK21-NEXT: movl 24(%ecx), %ebx ; FALLBACK21-NEXT: movl 28(%ecx), %edx ; FALLBACK21-NEXT: movzbl (%eax), %eax ; FALLBACK21-NEXT: movl %eax, %ecx ; FALLBACK21-NEXT: shlb $3, %cl ; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: sarl $31, %edx ; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; 
FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: andb $28, %al ; FALLBACK21-NEXT: movzbl %al, %ebp ; FALLBACK21-NEXT: movl 48(%esp,%ebp), %esi ; FALLBACK21-NEXT: movl 44(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 40(%esp,%ebp), %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 56(%esp,%ebp), %ebx ; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK21-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK21-NEXT: movl 32(%esp,%ebp), %edx ; FALLBACK21-NEXT: movl 36(%esp,%ebp), %edi ; FALLBACK21-NEXT: movl %edi, %esi ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK21-NEXT: shrdl %cl, %ebp, %esi ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK21-NEXT: movl %esi, 4(%ebp) ; FALLBACK21-NEXT: movl %ebx, 24(%ebp) ; FALLBACK21-NEXT: shrdl %cl, %edi, %edx ; FALLBACK21-NEXT: sarl %cl, %eax ; FALLBACK21-NEXT: movl %eax, 28(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 16(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 20(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 8(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 12(%ebp) ; FALLBACK21-NEXT: movl %edx, (%ebp) ; FALLBACK21-NEXT: addl $108, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi ; FALLBACK21-NEXT: popl %ebx ; FALLBACK21-NEXT: popl %ebp ; FALLBACK21-NEXT: retl ; ; FALLBACK22-LABEL: ashr_32bytes: ; FALLBACK22: # %bb.0: ; FALLBACK22-NEXT: pushl %ebp ; FALLBACK22-NEXT: pushl %ebx ; FALLBACK22-NEXT: pushl %edi ; FALLBACK22-NEXT: pushl %esi ; FALLBACK22-NEXT: subl $108, %esp ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movl 16(%ecx), %esi ; FALLBACK22-NEXT: movl 20(%ecx), %edi ; FALLBACK22-NEXT: movl 24(%ecx), %ebx ; FALLBACK22-NEXT: movl 28(%ecx), %edx ; FALLBACK22-NEXT: movzbl (%eax), %ecx ; FALLBACK22-NEXT: movl %ecx, %eax ; FALLBACK22-NEXT: shlb $3, %al ; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: sarl $31, %edx ; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: andb $28, %cl ; FALLBACK22-NEXT: movzbl %cl, %edi ; FALLBACK22-NEXT: shrxl %eax, 32(%esp,%edi), %ecx ; FALLBACK22-NEXT: movl %eax, %edx ; FALLBACK22-NEXT: notb %dl ; FALLBACK22-NEXT: movl 36(%esp,%edi), %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: addl %esi, %esi ; FALLBACK22-NEXT: shlxl %edx, %esi, %esi ; FALLBACK22-NEXT: orl %ecx, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill ; FALLBACK22-NEXT: movl 48(%esp,%edi), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: addl %ecx, %ecx ; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi ; FALLBACK22-NEXT: movl 44(%esp,%edi), %ecx ; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx ; FALLBACK22-NEXT: orl %ebx, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: addl %ecx, %ecx ; FALLBACK22-NEXT: shlxl %edx, %ecx, %esi ; FALLBACK22-NEXT: movl 40(%esp,%edi), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %eax, %ecx, %ebx ; FALLBACK22-NEXT: movl %eax, %ecx ; FALLBACK22-NEXT: orl %ebx, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 56(%esp,%edi), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %ebx ; FALLBACK22-NEXT: shlxl %edx, %ebx, %eax ; FALLBACK22-NEXT: movl 52(%esp,%edi), %ebx ; FALLBACK22-NEXT: shrxl %ecx, %ebx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl %ecx, %eax ; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %ebx, %ebx ; FALLBACK22-NEXT: shlxl %edx, %ebx, %ebx ; FALLBACK22-NEXT: orl %ebp, %ebx ; FALLBACK22-NEXT: shrxl %ecx, %esi, %ecx ; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK22-NEXT: movl 60(%esp,%edi), %edi ; FALLBACK22-NEXT: sarxl %eax, %edi, %eax ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %edx, %edi, %edi ; FALLBACK22-NEXT: orl %ecx, %edi ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: addl %ecx, %ecx ; FALLBACK22-NEXT: shlxl %edx, %ecx, %ecx ; FALLBACK22-NEXT: orl %esi, %ecx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK22-NEXT: movl %eax, 28(%edx) ; FALLBACK22-NEXT: movl %ecx, 4(%edx) ; FALLBACK22-NEXT: movl %edi, 24(%edx) ; 
FALLBACK22-NEXT: movl %ebx, 16(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 20(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 8(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, 12(%edx) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: movl %eax, (%edx) ; FALLBACK22-NEXT: addl $108, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx ; FALLBACK22-NEXT: popl %ebp ; FALLBACK22-NEXT: retl ; ; FALLBACK23-LABEL: ashr_32bytes: ; FALLBACK23: # %bb.0: ; FALLBACK23-NEXT: pushl %ebp ; FALLBACK23-NEXT: pushl %ebx ; FALLBACK23-NEXT: pushl %edi ; FALLBACK23-NEXT: pushl %esi ; FALLBACK23-NEXT: subl $108, %esp ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK23-NEXT: movups (%ecx), %xmm0 ; FALLBACK23-NEXT: movl 16(%ecx), %esi ; FALLBACK23-NEXT: movl 20(%ecx), %edi ; FALLBACK23-NEXT: movl 24(%ecx), %ebx ; FALLBACK23-NEXT: movl 28(%ecx), %edx ; FALLBACK23-NEXT: movzbl (%eax), %eax ; FALLBACK23-NEXT: movl %eax, %ecx ; FALLBACK23-NEXT: shlb $3, %cl ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: sarl $31, %edx ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: andb $28, %al ; FALLBACK23-NEXT: movzbl %al, %ebx ; FALLBACK23-NEXT: movl 
48(%esp,%ebx), %esi ; FALLBACK23-NEXT: movl 44(%esp,%ebx), %eax ; FALLBACK23-NEXT: movl %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %esi, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 40(%esp,%ebx), %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 56(%esp,%ebx), %ebp ; FALLBACK23-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK23-NEXT: movl %eax, %edi ; FALLBACK23-NEXT: shrdl %cl, %ebp, %edi ; FALLBACK23-NEXT: shrdl %cl, %eax, %esi ; FALLBACK23-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %ebp ; FALLBACK23-NEXT: movl 32(%esp,%ebx), %edx ; FALLBACK23-NEXT: movl 36(%esp,%ebx), %ebx ; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl %ebx, 4(%eax) ; FALLBACK23-NEXT: movl %ebp, 24(%eax) ; FALLBACK23-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK23-NEXT: movl %ebx, 28(%eax) ; FALLBACK23-NEXT: movl %esi, 16(%eax) ; FALLBACK23-NEXT: movl %edi, 20(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK23-NEXT: movl %esi, 8(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK23-NEXT: movl %esi, 12(%eax) ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK23-NEXT: shrdl %cl, %esi, %edx ; FALLBACK23-NEXT: movl %edx, (%eax) ; FALLBACK23-NEXT: addl $108, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi ; FALLBACK23-NEXT: popl %ebx ; FALLBACK23-NEXT: popl %ebp ; FALLBACK23-NEXT: retl ; ; 
FALLBACK24-LABEL: ashr_32bytes: ; FALLBACK24: # %bb.0: ; FALLBACK24-NEXT: pushl %ebp ; FALLBACK24-NEXT: pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi ; FALLBACK24-NEXT: subl $108, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK24-NEXT: movl 16(%ecx), %esi ; FALLBACK24-NEXT: movl 20(%ecx), %edi ; FALLBACK24-NEXT: movl 24(%ecx), %ebx ; FALLBACK24-NEXT: movl 28(%ecx), %edx ; FALLBACK24-NEXT: movzbl (%eax), %eax ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shlb $3, %cl ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: sarl $31, %edx ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: andb $28, %al ; FALLBACK24-NEXT: movzbl %al, %edi ; FALLBACK24-NEXT: movl 32(%esp,%edi), %eax ; FALLBACK24-NEXT: movl 36(%esp,%edi), %esi ; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: movl %ecx, %edx ; FALLBACK24-NEXT: movb %cl, %dh ; FALLBACK24-NEXT: notb %dl ; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: orl %eax, %esi ; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 44(%esp,%edi), %ebx ; FALLBACK24-NEXT: movl %ebx, %eax ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: movl 
48(%esp,%edi), %esi ; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: orl %eax, %esi ; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 40(%esp,%edi), %esi ; FALLBACK24-NEXT: movl %esi, %eax ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: addl %ebx, %ebx ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %eax, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 52(%esp,%edi), %ebp ; FALLBACK24-NEXT: movl %ebp, %eax ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: movl 56(%esp,%edi), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %eax, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: addl %ebp, %ebp ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebp ; FALLBACK24-NEXT: orl %eax, %ebp ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: movl 60(%esp,%edi), %eax ; FALLBACK24-NEXT: leal (%eax,%eax), %edi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: orl %ebx, %edi ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: orl 
%ebx, %esi ; FALLBACK24-NEXT: movb %dh, %cl ; FALLBACK24-NEXT: sarl %cl, %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: movl %eax, 28(%ecx) ; FALLBACK24-NEXT: movl %esi, 4(%ecx) ; FALLBACK24-NEXT: movl %edi, 24(%ecx) ; FALLBACK24-NEXT: movl %ebp, 16(%ecx) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: movl %eax, 20(%ecx) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: movl %eax, 8(%ecx) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: movl %eax, 12(%ecx) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: movl %eax, (%ecx) ; FALLBACK24-NEXT: addl $108, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx ; FALLBACK24-NEXT: popl %ebp ; FALLBACK24-NEXT: retl ; ; FALLBACK25-LABEL: ashr_32bytes: ; FALLBACK25: # %bb.0: ; FALLBACK25-NEXT: pushl %ebp ; FALLBACK25-NEXT: pushl %ebx ; FALLBACK25-NEXT: pushl %edi ; FALLBACK25-NEXT: pushl %esi ; FALLBACK25-NEXT: subl $108, %esp ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK25-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK25-NEXT: movl 16(%ecx), %esi ; FALLBACK25-NEXT: movl 20(%ecx), %edi ; FALLBACK25-NEXT: movl 24(%ecx), %ebx ; FALLBACK25-NEXT: movl 28(%ecx), %edx ; FALLBACK25-NEXT: movzbl (%eax), %eax ; FALLBACK25-NEXT: movl %eax, %ecx ; FALLBACK25-NEXT: shlb $3, %cl ; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: sarl $31, %edx ; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %edx, 
{{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: andb $28, %al ; FALLBACK25-NEXT: movzbl %al, %ebp ; FALLBACK25-NEXT: movl 48(%esp,%ebp), %esi ; FALLBACK25-NEXT: movl 44(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 40(%esp,%ebp), %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 56(%esp,%ebp), %ebx ; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK25-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK25-NEXT: movl 32(%esp,%ebp), %edx ; FALLBACK25-NEXT: movl 36(%esp,%ebp), %edi ; FALLBACK25-NEXT: movl %edi, %esi ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK25-NEXT: shrdl %cl, %ebp, %esi ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK25-NEXT: movl %esi, 4(%ebp) ; FALLBACK25-NEXT: movl %ebx, 24(%ebp) ; FALLBACK25-NEXT: shrdl %cl, %edi, %edx ; FALLBACK25-NEXT: sarl %cl, %eax ; FALLBACK25-NEXT: movl %eax, 28(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 16(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 20(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 8(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; 
FALLBACK25-NEXT: movl %eax, 12(%ebp) ; FALLBACK25-NEXT: movl %edx, (%ebp) ; FALLBACK25-NEXT: addl $108, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi ; FALLBACK25-NEXT: popl %ebx ; FALLBACK25-NEXT: popl %ebp ; FALLBACK25-NEXT: retl ; ; FALLBACK26-LABEL: ashr_32bytes: ; FALLBACK26: # %bb.0: ; FALLBACK26-NEXT: pushl %ebp ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi ; FALLBACK26-NEXT: subl $108, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK26-NEXT: movl 16(%ecx), %esi ; FALLBACK26-NEXT: movl 20(%ecx), %edi ; FALLBACK26-NEXT: movl 24(%ecx), %ebx ; FALLBACK26-NEXT: movl 28(%ecx), %edx ; FALLBACK26-NEXT: movzbl (%eax), %ecx ; FALLBACK26-NEXT: movl %ecx, %eax ; FALLBACK26-NEXT: shlb $3, %al ; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: sarl $31, %edx ; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: andb $28, %cl ; FALLBACK26-NEXT: movzbl %cl, %edi ; FALLBACK26-NEXT: shrxl %eax, 32(%esp,%edi), %ecx ; FALLBACK26-NEXT: movl %eax, %edx ; FALLBACK26-NEXT: notb %dl ; FALLBACK26-NEXT: movl 36(%esp,%edi), %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: addl %esi, %esi ; FALLBACK26-NEXT: shlxl %edx, %esi, %esi ; FALLBACK26-NEXT: orl %ecx, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
FALLBACK26-NEXT: movl 48(%esp,%edi), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: addl %ecx, %ecx ; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi ; FALLBACK26-NEXT: movl 44(%esp,%edi), %ecx ; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx ; FALLBACK26-NEXT: orl %ebx, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: addl %ecx, %ecx ; FALLBACK26-NEXT: shlxl %edx, %ecx, %esi ; FALLBACK26-NEXT: movl 40(%esp,%edi), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %eax, %ecx, %ebx ; FALLBACK26-NEXT: movl %eax, %ecx ; FALLBACK26-NEXT: orl %ebx, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 56(%esp,%edi), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %ebx ; FALLBACK26-NEXT: shlxl %edx, %ebx, %eax ; FALLBACK26-NEXT: movl 52(%esp,%edi), %ebx ; FALLBACK26-NEXT: shrxl %ecx, %ebx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl %ecx, %eax ; FALLBACK26-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %ebx, %ebx ; FALLBACK26-NEXT: shlxl %edx, %ebx, %ebx ; FALLBACK26-NEXT: orl %ebp, %ebx ; FALLBACK26-NEXT: shrxl %ecx, %esi, %ecx ; FALLBACK26-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK26-NEXT: movl 60(%esp,%edi), %edi ; FALLBACK26-NEXT: sarxl %eax, %edi, %eax ; FALLBACK26-NEXT: addl %edi, %edi ; FALLBACK26-NEXT: shlxl %edx, %edi, %edi ; FALLBACK26-NEXT: orl %ecx, %edi ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: addl %ecx, %ecx ; FALLBACK26-NEXT: shlxl %edx, %ecx, %ecx ; FALLBACK26-NEXT: orl %esi, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK26-NEXT: movl %eax, 28(%edx) ; FALLBACK26-NEXT: movl %ecx, 4(%edx) ; FALLBACK26-NEXT: movl %edi, 24(%edx) ; FALLBACK26-NEXT: movl %ebx, 
16(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 20(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 12(%edx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, (%edx) ; FALLBACK26-NEXT: addl $108, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx ; FALLBACK26-NEXT: popl %ebp ; FALLBACK26-NEXT: retl ; ; FALLBACK27-LABEL: ashr_32bytes: ; FALLBACK27: # %bb.0: ; FALLBACK27-NEXT: pushl %ebp ; FALLBACK27-NEXT: pushl %ebx ; FALLBACK27-NEXT: pushl %edi ; FALLBACK27-NEXT: pushl %esi ; FALLBACK27-NEXT: subl $108, %esp ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK27-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK27-NEXT: movl 16(%ecx), %esi ; FALLBACK27-NEXT: movl 20(%ecx), %edi ; FALLBACK27-NEXT: movl 24(%ecx), %ebx ; FALLBACK27-NEXT: movl 28(%ecx), %edx ; FALLBACK27-NEXT: movzbl (%eax), %eax ; FALLBACK27-NEXT: movl %eax, %ecx ; FALLBACK27-NEXT: shlb $3, %cl ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: sarl $31, %edx ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: andb $28, %al ; FALLBACK27-NEXT: movzbl %al, %ebx ; FALLBACK27-NEXT: movl 48(%esp,%ebx), %esi ; 
FALLBACK27-NEXT: movl 44(%esp,%ebx), %eax ; FALLBACK27-NEXT: movl %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %esi, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 40(%esp,%ebx), %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 56(%esp,%ebx), %ebp ; FALLBACK27-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK27-NEXT: movl %eax, %edi ; FALLBACK27-NEXT: shrdl %cl, %ebp, %edi ; FALLBACK27-NEXT: shrdl %cl, %eax, %esi ; FALLBACK27-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %ebp ; FALLBACK27-NEXT: movl 32(%esp,%ebx), %edx ; FALLBACK27-NEXT: movl 36(%esp,%ebx), %ebx ; FALLBACK27-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: movl %ebx, 4(%eax) ; FALLBACK27-NEXT: movl %ebp, 24(%eax) ; FALLBACK27-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK27-NEXT: movl %ebx, 28(%eax) ; FALLBACK27-NEXT: movl %esi, 16(%eax) ; FALLBACK27-NEXT: movl %edi, 20(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK27-NEXT: movl %esi, 8(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK27-NEXT: movl %esi, 12(%eax) ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK27-NEXT: shrdl %cl, %esi, %edx ; FALLBACK27-NEXT: movl %edx, (%eax) ; FALLBACK27-NEXT: addl $108, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi ; FALLBACK27-NEXT: popl %ebx ; FALLBACK27-NEXT: popl %ebp ; FALLBACK27-NEXT: retl ; ; FALLBACK28-LABEL: 
ashr_32bytes: ; FALLBACK28: # %bb.0: ; FALLBACK28-NEXT: pushl %ebp ; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi ; FALLBACK28-NEXT: subl $108, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK28-NEXT: movl 16(%ecx), %esi ; FALLBACK28-NEXT: movl 20(%ecx), %edi ; FALLBACK28-NEXT: movl 24(%ecx), %ebx ; FALLBACK28-NEXT: movl 28(%ecx), %edx ; FALLBACK28-NEXT: movzbl (%eax), %eax ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shlb $3, %cl ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: sarl $31, %edx ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: andb $28, %al ; FALLBACK28-NEXT: movzbl %al, %edi ; FALLBACK28-NEXT: movl 32(%esp,%edi), %eax ; FALLBACK28-NEXT: movl 36(%esp,%edi), %esi ; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: movl %ecx, %edx ; FALLBACK28-NEXT: movb %cl, %dh ; FALLBACK28-NEXT: notb %dl ; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: orl %eax, %esi ; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 44(%esp,%edi), %ebx ; FALLBACK28-NEXT: movl %ebx, %eax ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: movl 48(%esp,%edi), %esi ; 
FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: orl %eax, %esi ; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 40(%esp,%edi), %esi ; FALLBACK28-NEXT: movl %esi, %eax ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: addl %ebx, %ebx ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %eax, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 52(%esp,%edi), %ebp ; FALLBACK28-NEXT: movl %ebp, %eax ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: movl 56(%esp,%edi), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %eax, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: addl %ebp, %ebp ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebp ; FALLBACK28-NEXT: orl %eax, %ebp ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: movl 60(%esp,%edi), %eax ; FALLBACK28-NEXT: leal (%eax,%eax), %edi ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %edi ; FALLBACK28-NEXT: orl %ebx, %edi ; FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: orl %ebx, %esi ; 
FALLBACK28-NEXT: movb %dh, %cl ; FALLBACK28-NEXT: sarl %cl, %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: movl %eax, 28(%ecx) ; FALLBACK28-NEXT: movl %esi, 4(%ecx) ; FALLBACK28-NEXT: movl %edi, 24(%ecx) ; FALLBACK28-NEXT: movl %ebp, 16(%ecx) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: movl %eax, 20(%ecx) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: movl %eax, 8(%ecx) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: movl %eax, 12(%ecx) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: movl %eax, (%ecx) ; FALLBACK28-NEXT: addl $108, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx ; FALLBACK28-NEXT: popl %ebp ; FALLBACK28-NEXT: retl ; ; FALLBACK29-LABEL: ashr_32bytes: ; FALLBACK29: # %bb.0: ; FALLBACK29-NEXT: pushl %ebp ; FALLBACK29-NEXT: pushl %ebx ; FALLBACK29-NEXT: pushl %edi ; FALLBACK29-NEXT: pushl %esi ; FALLBACK29-NEXT: subl $108, %esp ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK29-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK29-NEXT: movl 16(%ecx), %esi ; FALLBACK29-NEXT: movl 20(%ecx), %edi ; FALLBACK29-NEXT: movl 24(%ecx), %ebx ; FALLBACK29-NEXT: movl 28(%ecx), %edx ; FALLBACK29-NEXT: movzbl (%eax), %eax ; FALLBACK29-NEXT: movl %eax, %ecx ; FALLBACK29-NEXT: shlb $3, %cl ; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: sarl $31, %edx ; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; 
FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: andb $28, %al ; FALLBACK29-NEXT: movzbl %al, %ebp ; FALLBACK29-NEXT: movl 48(%esp,%ebp), %esi ; FALLBACK29-NEXT: movl 44(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 40(%esp,%ebp), %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 56(%esp,%ebp), %ebx ; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK29-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK29-NEXT: movl 32(%esp,%ebp), %edx ; FALLBACK29-NEXT: movl 36(%esp,%ebp), %edi ; FALLBACK29-NEXT: movl %edi, %esi ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK29-NEXT: shrdl %cl, %ebp, %esi ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK29-NEXT: movl %esi, 4(%ebp) ; FALLBACK29-NEXT: movl %ebx, 24(%ebp) ; FALLBACK29-NEXT: shrdl %cl, %edi, %edx ; FALLBACK29-NEXT: sarl %cl, %eax ; FALLBACK29-NEXT: movl %eax, 28(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 16(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 20(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 8(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 
12(%ebp) ; FALLBACK29-NEXT: movl %edx, (%ebp) ; FALLBACK29-NEXT: addl $108, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi ; FALLBACK29-NEXT: popl %ebx ; FALLBACK29-NEXT: popl %ebp ; FALLBACK29-NEXT: retl ; ; FALLBACK30-LABEL: ashr_32bytes: ; FALLBACK30: # %bb.0: ; FALLBACK30-NEXT: pushl %ebp ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi ; FALLBACK30-NEXT: subl $108, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK30-NEXT: movl 16(%ecx), %esi ; FALLBACK30-NEXT: movl 20(%ecx), %edi ; FALLBACK30-NEXT: movl 24(%ecx), %ebx ; FALLBACK30-NEXT: movl 28(%ecx), %edx ; FALLBACK30-NEXT: movzbl (%eax), %ecx ; FALLBACK30-NEXT: movl %ecx, %eax ; FALLBACK30-NEXT: shlb $3, %al ; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: sarl $31, %edx ; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: andb $28, %cl ; FALLBACK30-NEXT: movzbl %cl, %edi ; FALLBACK30-NEXT: shrxl %eax, 32(%esp,%edi), %ecx ; FALLBACK30-NEXT: movl %eax, %edx ; FALLBACK30-NEXT: notb %dl ; FALLBACK30-NEXT: movl 36(%esp,%edi), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: addl %esi, %esi ; FALLBACK30-NEXT: shlxl %edx, %esi, %esi ; FALLBACK30-NEXT: orl %ecx, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 48(%esp,%edi), 
%ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: addl %ecx, %ecx ; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi ; FALLBACK30-NEXT: movl 44(%esp,%edi), %ecx ; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx ; FALLBACK30-NEXT: orl %ebx, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: addl %ecx, %ecx ; FALLBACK30-NEXT: shlxl %edx, %ecx, %esi ; FALLBACK30-NEXT: movl 40(%esp,%edi), %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %eax, %ecx, %ebx ; FALLBACK30-NEXT: movl %eax, %ecx ; FALLBACK30-NEXT: orl %ebx, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 56(%esp,%edi), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %ebx ; FALLBACK30-NEXT: shlxl %edx, %ebx, %eax ; FALLBACK30-NEXT: movl 52(%esp,%edi), %ebx ; FALLBACK30-NEXT: shrxl %ecx, %ebx, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl %ecx, %eax ; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %ebx, %ebx ; FALLBACK30-NEXT: shlxl %edx, %ebx, %ebx ; FALLBACK30-NEXT: orl %ebp, %ebx ; FALLBACK30-NEXT: shrxl %ecx, %esi, %ecx ; FALLBACK30-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; FALLBACK30-NEXT: movl 60(%esp,%edi), %edi ; FALLBACK30-NEXT: sarxl %eax, %edi, %eax ; FALLBACK30-NEXT: addl %edi, %edi ; FALLBACK30-NEXT: shlxl %edx, %edi, %edi ; FALLBACK30-NEXT: orl %ecx, %edi ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: addl %ecx, %ecx ; FALLBACK30-NEXT: shlxl %edx, %ecx, %ecx ; FALLBACK30-NEXT: orl %esi, %ecx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %edx ; FALLBACK30-NEXT: movl %eax, 28(%edx) ; FALLBACK30-NEXT: movl %ecx, 4(%edx) ; FALLBACK30-NEXT: movl %edi, 24(%edx) ; FALLBACK30-NEXT: movl %ebx, 16(%edx) ; FALLBACK30-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 20(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 12(%edx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, (%edx) ; FALLBACK30-NEXT: addl $108, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx ; FALLBACK30-NEXT: popl %ebp ; FALLBACK30-NEXT: retl ; ; FALLBACK31-LABEL: ashr_32bytes: ; FALLBACK31: # %bb.0: ; FALLBACK31-NEXT: pushl %ebp ; FALLBACK31-NEXT: pushl %ebx ; FALLBACK31-NEXT: pushl %edi ; FALLBACK31-NEXT: pushl %esi ; FALLBACK31-NEXT: subl $108, %esp ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK31-NEXT: vmovups (%ecx), %xmm0 ; FALLBACK31-NEXT: movl 16(%ecx), %esi ; FALLBACK31-NEXT: movl 20(%ecx), %edi ; FALLBACK31-NEXT: movl 24(%ecx), %ebx ; FALLBACK31-NEXT: movl 28(%ecx), %edx ; FALLBACK31-NEXT: movzbl (%eax), %eax ; FALLBACK31-NEXT: movl %eax, %ecx ; FALLBACK31-NEXT: shlb $3, %cl ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: sarl $31, %edx ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: andb $28, %al ; FALLBACK31-NEXT: movzbl %al, %ebx ; FALLBACK31-NEXT: movl 48(%esp,%ebx), %esi ; FALLBACK31-NEXT: movl 44(%esp,%ebx), 
%eax ; FALLBACK31-NEXT: movl %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %esi, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 40(%esp,%ebx), %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 56(%esp,%ebx), %ebp ; FALLBACK31-NEXT: movl 52(%esp,%ebx), %eax ; FALLBACK31-NEXT: movl %eax, %edi ; FALLBACK31-NEXT: shrdl %cl, %ebp, %edi ; FALLBACK31-NEXT: shrdl %cl, %eax, %esi ; FALLBACK31-NEXT: movl 60(%esp,%ebx), %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %ebp ; FALLBACK31-NEXT: movl 32(%esp,%ebx), %edx ; FALLBACK31-NEXT: movl 36(%esp,%ebx), %ebx ; FALLBACK31-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl %ebx, 4(%eax) ; FALLBACK31-NEXT: movl %ebp, 24(%eax) ; FALLBACK31-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; FALLBACK31-NEXT: movl %ebx, 28(%eax) ; FALLBACK31-NEXT: movl %esi, 16(%eax) ; FALLBACK31-NEXT: movl %edi, 20(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK31-NEXT: movl %esi, 8(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK31-NEXT: movl %esi, 12(%eax) ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; FALLBACK31-NEXT: shrdl %cl, %esi, %edx ; FALLBACK31-NEXT: movl %edx, (%eax) ; FALLBACK31-NEXT: addl $108, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi ; FALLBACK31-NEXT: popl %ebx ; FALLBACK31-NEXT: popl %ebp ; FALLBACK31-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %byteOff = load i256, ptr 
%byteOff.ptr, align 1 %bitOff = shl i256 %byteOff, 3 %res = ashr i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } define void @ashr_32bytes_dwordOff(ptr %src.ptr, ptr %dwordOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: ashr_32bytes_dwordOff: ; FALLBACK0: # %bb.0: ; FALLBACK0-NEXT: pushq %rbx ; FALLBACK0-NEXT: movq (%rdi), %rcx ; FALLBACK0-NEXT: movq 8(%rdi), %r8 ; FALLBACK0-NEXT: movq 16(%rdi), %r9 ; FALLBACK0-NEXT: movq 24(%rdi), %rdi ; FALLBACK0-NEXT: movzbl (%rsi), %esi ; FALLBACK0-NEXT: movl %esi, %eax ; FALLBACK0-NEXT: shlb $5, %al ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: sarq $63, %rdi ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: andb $6, %sil ; FALLBACK0-NEXT: movzbl %sil, %r9d ; FALLBACK0-NEXT: movq -64(%rsp,%r9,4), %r10 ; FALLBACK0-NEXT: movq -56(%rsp,%r9,4), %rdi ; FALLBACK0-NEXT: movq %rdi, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq -48(%rsp,%r9,4), %rbx ; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r8 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 ; FALLBACK0-NEXT: orq %r11, %r8 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r10 ; FALLBACK0-NEXT: addq %rdi, %rdi ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %rdi ; FALLBACK0-NEXT: orq %r10, %rdi ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx ; FALLBACK0-NEXT: movq -40(%rsp,%r9,4), %r9 ; FALLBACK0-NEXT: leaq (%r9,%r9), %r10 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: orq %rbx, %r10 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: sarq %cl, %r9 ; 
FALLBACK0-NEXT: movq %r9, 24(%rdx) ; FALLBACK0-NEXT: movq %r10, 16(%rdx) ; FALLBACK0-NEXT: movq %rdi, (%rdx) ; FALLBACK0-NEXT: movq %r8, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: ashr_32bytes_dwordOff: ; FALLBACK1: # %bb.0: ; FALLBACK1-NEXT: movq (%rdi), %rax ; FALLBACK1-NEXT: movq 8(%rdi), %r8 ; FALLBACK1-NEXT: movq 16(%rdi), %r9 ; FALLBACK1-NEXT: movq 24(%rdi), %rdi ; FALLBACK1-NEXT: movzbl (%rsi), %esi ; FALLBACK1-NEXT: movl %esi, %ecx ; FALLBACK1-NEXT: shlb $5, %cl ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: sarq $63, %rdi ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: andb $6, %sil ; FALLBACK1-NEXT: movzbl %sil, %eax ; FALLBACK1-NEXT: movq -56(%rsp,%rax,4), %rsi ; FALLBACK1-NEXT: movq -72(%rsp,%rax,4), %rdi ; FALLBACK1-NEXT: movq -64(%rsp,%rax,4), %r8 ; FALLBACK1-NEXT: movq %r8, %r9 ; FALLBACK1-NEXT: shrdq %cl, %rsi, %r9 ; FALLBACK1-NEXT: movq -48(%rsp,%rax,4), %rax ; FALLBACK1-NEXT: shrdq %cl, %rax, %rsi ; FALLBACK1-NEXT: shrdq %cl, %r8, %rdi ; FALLBACK1-NEXT: sarq %cl, %rax ; FALLBACK1-NEXT: movq %rsi, 16(%rdx) ; FALLBACK1-NEXT: movq %rax, 24(%rdx) ; FALLBACK1-NEXT: movq %rdi, (%rdx) ; FALLBACK1-NEXT: movq %r9, 8(%rdx) ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: ashr_32bytes_dwordOff: ; FALLBACK2: # %bb.0: ; FALLBACK2-NEXT: movq (%rdi), %rcx ; FALLBACK2-NEXT: movq 8(%rdi), %r8 ; FALLBACK2-NEXT: movq 16(%rdi), %r9 ; FALLBACK2-NEXT: movq 24(%rdi), %rdi ; FALLBACK2-NEXT: movzbl (%rsi), %esi ; FALLBACK2-NEXT: movl %esi, %eax ; FALLBACK2-NEXT: shlb $5, %al ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: 
movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: sarq $63, %rdi ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: andb $6, %sil ; FALLBACK2-NEXT: movzbl %sil, %ecx ; FALLBACK2-NEXT: movq -64(%rsp,%rcx,4), %rsi ; FALLBACK2-NEXT: movq -56(%rsp,%rcx,4), %rdi ; FALLBACK2-NEXT: shrxq %rax, %rsi, %r8 ; FALLBACK2-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %r9 ; FALLBACK2-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK2-NEXT: movq -48(%rsp,%rcx,4), %rcx ; FALLBACK2-NEXT: sarxq %rax, %rcx, %r11 ; FALLBACK2-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK2-NEXT: notb %al ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK2-NEXT: orq %r8, %rdi ; FALLBACK2-NEXT: addq %rsi, %rsi ; FALLBACK2-NEXT: shlxq %rax, %rsi, %rsi ; FALLBACK2-NEXT: orq %r9, %rsi ; FALLBACK2-NEXT: addq %rcx, %rcx ; FALLBACK2-NEXT: shlxq %rax, %rcx, %rax ; FALLBACK2-NEXT: orq %r10, %rax ; FALLBACK2-NEXT: movq %r11, 24(%rdx) ; FALLBACK2-NEXT: movq %rax, 16(%rdx) ; FALLBACK2-NEXT: movq %rsi, (%rdx) ; FALLBACK2-NEXT: movq %rdi, 8(%rdx) ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_32bytes_dwordOff: ; FALLBACK3: # %bb.0: ; FALLBACK3-NEXT: movq (%rdi), %rax ; FALLBACK3-NEXT: movq 8(%rdi), %r8 ; FALLBACK3-NEXT: movq 16(%rdi), %r9 ; FALLBACK3-NEXT: movq 24(%rdi), %rdi ; FALLBACK3-NEXT: movzbl (%rsi), %esi ; FALLBACK3-NEXT: movl %esi, %ecx ; FALLBACK3-NEXT: shlb $5, %cl ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: sarq $63, %rdi ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: andb $6, %sil ; FALLBACK3-NEXT: movzbl 
%sil, %eax ; FALLBACK3-NEXT: movq -56(%rsp,%rax,4), %rsi ; FALLBACK3-NEXT: movq -72(%rsp,%rax,4), %rdi ; FALLBACK3-NEXT: movq -64(%rsp,%rax,4), %r8 ; FALLBACK3-NEXT: movq %r8, %r9 ; FALLBACK3-NEXT: shrdq %cl, %rsi, %r9 ; FALLBACK3-NEXT: movq -48(%rsp,%rax,4), %rax ; FALLBACK3-NEXT: shrdq %cl, %rax, %rsi ; FALLBACK3-NEXT: shrdq %cl, %r8, %rdi ; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax ; FALLBACK3-NEXT: movq %rsi, 16(%rdx) ; FALLBACK3-NEXT: movq %rax, 24(%rdx) ; FALLBACK3-NEXT: movq %rdi, (%rdx) ; FALLBACK3-NEXT: movq %r9, 8(%rdx) ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: ashr_32bytes_dwordOff: ; FALLBACK4: # %bb.0: ; FALLBACK4-NEXT: pushq %rbx ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movq 16(%rdi), %rcx ; FALLBACK4-NEXT: movq 24(%rdi), %rdi ; FALLBACK4-NEXT: movzbl (%rsi), %esi ; FALLBACK4-NEXT: movl %esi, %eax ; FALLBACK4-NEXT: shlb $5, %al ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: sarq $63, %rdi ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: andb $6, %sil ; FALLBACK4-NEXT: movzbl %sil, %r9d ; FALLBACK4-NEXT: movq -64(%rsp,%r9,4), %r10 ; FALLBACK4-NEXT: movq -56(%rsp,%r9,4), %r8 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rdi ; FALLBACK4-NEXT: orq %r10, %rdi ; FALLBACK4-NEXT: movq -48(%rsp,%r9,4), %r10 ; FALLBACK4-NEXT: movq %r10, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r11 ; FALLBACK4-NEXT: movq -40(%rsp,%r9,4), %r9 ; FALLBACK4-NEXT: leaq (%r9,%r9), %rbx ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rbx ; FALLBACK4-NEXT: orq %r11, %rbx ; 
FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 ; FALLBACK4-NEXT: addq %r10, %r10 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: orq %r8, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: sarq %cl, %r9 ; FALLBACK4-NEXT: movq %r9, 24(%rdx) ; FALLBACK4-NEXT: movq %r10, 8(%rdx) ; FALLBACK4-NEXT: movq %rbx, 16(%rdx) ; FALLBACK4-NEXT: movq %rdi, (%rdx) ; FALLBACK4-NEXT: popq %rbx ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: ashr_32bytes_dwordOff: ; FALLBACK5: # %bb.0: ; FALLBACK5-NEXT: movups (%rdi), %xmm0 ; FALLBACK5-NEXT: movq 16(%rdi), %rax ; FALLBACK5-NEXT: movq 24(%rdi), %rdi ; FALLBACK5-NEXT: movzbl (%rsi), %esi ; FALLBACK5-NEXT: movl %esi, %ecx ; FALLBACK5-NEXT: shlb $5, %cl ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: sarq $63, %rdi ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: andb $6, %sil ; FALLBACK5-NEXT: movzbl %sil, %eax ; FALLBACK5-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK5-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK5-NEXT: movq %rdi, %r8 ; FALLBACK5-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK5-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK5-NEXT: movq -64(%rsp,%rax,4), %rax ; FALLBACK5-NEXT: movq %rax, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK5-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK5-NEXT: sarq %cl, %rsi ; FALLBACK5-NEXT: movq %r10, 8(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) ; FALLBACK5-NEXT: movq %r9, (%rdx) ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: ashr_32bytes_dwordOff: ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movq 16(%rdi), %rcx ; FALLBACK6-NEXT: movq 24(%rdi), %rdi ; FALLBACK6-NEXT: movzbl (%rsi), %esi ; FALLBACK6-NEXT: movl %esi, 
%eax ; FALLBACK6-NEXT: shlb $5, %al ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: sarq $63, %rdi ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: andb $6, %sil ; FALLBACK6-NEXT: movzbl %sil, %ecx ; FALLBACK6-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi ; FALLBACK6-NEXT: movq -64(%rsp,%rcx,4), %rdi ; FALLBACK6-NEXT: movq -56(%rsp,%rcx,4), %r8 ; FALLBACK6-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK6-NEXT: movq -48(%rsp,%rcx,4), %rcx ; FALLBACK6-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK6-NEXT: sarxq %rax, %rcx, %r11 ; FALLBACK6-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK6-NEXT: notb %al ; FALLBACK6-NEXT: addq %rdi, %rdi ; FALLBACK6-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK6-NEXT: orq %rsi, %rdi ; FALLBACK6-NEXT: addq %rcx, %rcx ; FALLBACK6-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK6-NEXT: orq %r9, %rcx ; FALLBACK6-NEXT: addq %r8, %r8 ; FALLBACK6-NEXT: shlxq %rax, %r8, %rax ; FALLBACK6-NEXT: orq %r10, %rax ; FALLBACK6-NEXT: movq %r11, 24(%rdx) ; FALLBACK6-NEXT: movq %rax, 8(%rdx) ; FALLBACK6-NEXT: movq %rcx, 16(%rdx) ; FALLBACK6-NEXT: movq %rdi, (%rdx) ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_32bytes_dwordOff: ; FALLBACK7: # %bb.0: ; FALLBACK7-NEXT: movups (%rdi), %xmm0 ; FALLBACK7-NEXT: movq 16(%rdi), %rax ; FALLBACK7-NEXT: movq 24(%rdi), %rdi ; FALLBACK7-NEXT: movzbl (%rsi), %esi ; FALLBACK7-NEXT: movl %esi, %ecx ; FALLBACK7-NEXT: shlb $5, %cl ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: sarq $63, %rdi ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, 
-{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: andb $6, %sil ; FALLBACK7-NEXT: movzbl %sil, %eax ; FALLBACK7-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK7-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK7-NEXT: movq %rdi, %r8 ; FALLBACK7-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK7-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK7-NEXT: movq -64(%rsp,%rax,4), %rax ; FALLBACK7-NEXT: movq %rax, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK7-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK7-NEXT: sarxq %rcx, %rsi, %rax ; FALLBACK7-NEXT: movq %r10, 8(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) ; FALLBACK7-NEXT: movq %rax, 24(%rdx) ; FALLBACK7-NEXT: movq %r9, (%rdx) ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: ashr_32bytes_dwordOff: ; FALLBACK8: # %bb.0: ; FALLBACK8-NEXT: pushq %rbx ; FALLBACK8-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK8-NEXT: movq 16(%rdi), %rcx ; FALLBACK8-NEXT: movq 24(%rdi), %rdi ; FALLBACK8-NEXT: movzbl (%rsi), %esi ; FALLBACK8-NEXT: movl %esi, %eax ; FALLBACK8-NEXT: shlb $5, %al ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: sarq $63, %rdi ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: andb $6, %sil ; FALLBACK8-NEXT: movzbl %sil, %r9d ; FALLBACK8-NEXT: movq -64(%rsp,%r9,4), %r10 ; FALLBACK8-NEXT: movq -56(%rsp,%r9,4), %r8 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rdi ; FALLBACK8-NEXT: orq %r10, %rdi ; FALLBACK8-NEXT: movq -48(%rsp,%r9,4), %r10 ; FALLBACK8-NEXT: movq %r10, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r11 ; FALLBACK8-NEXT: movq -40(%rsp,%r9,4), %r9 ; FALLBACK8-NEXT: leaq (%r9,%r9), %rbx 
; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rbx ; FALLBACK8-NEXT: orq %r11, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r8 ; FALLBACK8-NEXT: addq %r10, %r10 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: orq %r8, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: sarq %cl, %r9 ; FALLBACK8-NEXT: movq %r9, 24(%rdx) ; FALLBACK8-NEXT: movq %r10, 8(%rdx) ; FALLBACK8-NEXT: movq %rbx, 16(%rdx) ; FALLBACK8-NEXT: movq %rdi, (%rdx) ; FALLBACK8-NEXT: popq %rbx ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: ashr_32bytes_dwordOff: ; FALLBACK9: # %bb.0: ; FALLBACK9-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK9-NEXT: movq 16(%rdi), %rax ; FALLBACK9-NEXT: movq 24(%rdi), %rdi ; FALLBACK9-NEXT: movzbl (%rsi), %esi ; FALLBACK9-NEXT: movl %esi, %ecx ; FALLBACK9-NEXT: shlb $5, %cl ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: sarq $63, %rdi ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: andb $6, %sil ; FALLBACK9-NEXT: movzbl %sil, %eax ; FALLBACK9-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK9-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK9-NEXT: movq %rdi, %r8 ; FALLBACK9-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK9-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK9-NEXT: movq -64(%rsp,%rax,4), %rax ; FALLBACK9-NEXT: movq %rax, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK9-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK9-NEXT: sarq %cl, %rsi ; FALLBACK9-NEXT: movq %r10, 8(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) ; FALLBACK9-NEXT: movq %r9, (%rdx) ; FALLBACK9-NEXT: retq ; ; FALLBACK10-LABEL: ashr_32bytes_dwordOff: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK10-NEXT: movq 16(%rdi), 
%rcx ; FALLBACK10-NEXT: movq 24(%rdi), %rdi ; FALLBACK10-NEXT: movzbl (%rsi), %esi ; FALLBACK10-NEXT: movl %esi, %eax ; FALLBACK10-NEXT: shlb $5, %al ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: sarq $63, %rdi ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: andb $6, %sil ; FALLBACK10-NEXT: movzbl %sil, %ecx ; FALLBACK10-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi ; FALLBACK10-NEXT: movq -64(%rsp,%rcx,4), %rdi ; FALLBACK10-NEXT: movq -56(%rsp,%rcx,4), %r8 ; FALLBACK10-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK10-NEXT: movq -48(%rsp,%rcx,4), %rcx ; FALLBACK10-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK10-NEXT: sarxq %rax, %rcx, %r11 ; FALLBACK10-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK10-NEXT: notb %al ; FALLBACK10-NEXT: addq %rdi, %rdi ; FALLBACK10-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK10-NEXT: orq %rsi, %rdi ; FALLBACK10-NEXT: addq %rcx, %rcx ; FALLBACK10-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK10-NEXT: orq %r9, %rcx ; FALLBACK10-NEXT: addq %r8, %r8 ; FALLBACK10-NEXT: shlxq %rax, %r8, %rax ; FALLBACK10-NEXT: orq %r10, %rax ; FALLBACK10-NEXT: movq %r11, 24(%rdx) ; FALLBACK10-NEXT: movq %rax, 8(%rdx) ; FALLBACK10-NEXT: movq %rcx, 16(%rdx) ; FALLBACK10-NEXT: movq %rdi, (%rdx) ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_32bytes_dwordOff: ; FALLBACK11: # %bb.0: ; FALLBACK11-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK11-NEXT: movq 16(%rdi), %rax ; FALLBACK11-NEXT: movq 24(%rdi), %rdi ; FALLBACK11-NEXT: movzbl (%rsi), %esi ; FALLBACK11-NEXT: movl %esi, %ecx ; FALLBACK11-NEXT: shlb $5, %cl ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: sarq $63, %rdi ; 
FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: andb $6, %sil ; FALLBACK11-NEXT: movzbl %sil, %eax ; FALLBACK11-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK11-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK11-NEXT: movq %rdi, %r8 ; FALLBACK11-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK11-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK11-NEXT: movq -64(%rsp,%rax,4), %rax ; FALLBACK11-NEXT: movq %rax, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK11-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK11-NEXT: sarxq %rcx, %rsi, %rax ; FALLBACK11-NEXT: movq %r10, 8(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) ; FALLBACK11-NEXT: movq %rax, 24(%rdx) ; FALLBACK11-NEXT: movq %r9, (%rdx) ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: ashr_32bytes_dwordOff: ; FALLBACK12: # %bb.0: ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK12-NEXT: movq 16(%rdi), %rcx ; FALLBACK12-NEXT: movq 24(%rdi), %rdi ; FALLBACK12-NEXT: movzbl (%rsi), %esi ; FALLBACK12-NEXT: movl %esi, %eax ; FALLBACK12-NEXT: shlb $5, %al ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: sarq $63, %rdi ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: andb $6, %sil ; FALLBACK12-NEXT: movzbl %sil, %r9d ; FALLBACK12-NEXT: movq -64(%rsp,%r9,4), %r10 ; FALLBACK12-NEXT: movq -56(%rsp,%r9,4), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi ; FALLBACK12-NEXT: orq %r10, %rdi ; 
FALLBACK12-NEXT: movq -48(%rsp,%r9,4), %r10 ; FALLBACK12-NEXT: movq %r10, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r11 ; FALLBACK12-NEXT: movq -40(%rsp,%r9,4), %r9 ; FALLBACK12-NEXT: leaq (%r9,%r9), %rbx ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r11, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r8 ; FALLBACK12-NEXT: addq %r10, %r10 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: orq %r8, %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: sarq %cl, %r9 ; FALLBACK12-NEXT: movq %r9, 24(%rdx) ; FALLBACK12-NEXT: movq %r10, 8(%rdx) ; FALLBACK12-NEXT: movq %rbx, 16(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: popq %rbx ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: ashr_32bytes_dwordOff: ; FALLBACK13: # %bb.0: ; FALLBACK13-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK13-NEXT: movq 16(%rdi), %rax ; FALLBACK13-NEXT: movq 24(%rdi), %rdi ; FALLBACK13-NEXT: movzbl (%rsi), %esi ; FALLBACK13-NEXT: movl %esi, %ecx ; FALLBACK13-NEXT: shlb $5, %cl ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: sarq $63, %rdi ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: andb $6, %sil ; FALLBACK13-NEXT: movzbl %sil, %eax ; FALLBACK13-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK13-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK13-NEXT: movq %rdi, %r8 ; FALLBACK13-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK13-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK13-NEXT: movq -64(%rsp,%rax,4), %rax ; FALLBACK13-NEXT: movq %rax, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK13-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK13-NEXT: sarq %cl, %rsi ; FALLBACK13-NEXT: movq %r10, 8(%rdx) 
; FALLBACK13-NEXT: movq %r8, 16(%rdx) ; FALLBACK13-NEXT: movq %rsi, 24(%rdx) ; FALLBACK13-NEXT: movq %r9, (%rdx) ; FALLBACK13-NEXT: retq ; ; FALLBACK14-LABEL: ashr_32bytes_dwordOff: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK14-NEXT: movq 16(%rdi), %rcx ; FALLBACK14-NEXT: movq 24(%rdi), %rdi ; FALLBACK14-NEXT: movzbl (%rsi), %esi ; FALLBACK14-NEXT: movl %esi, %eax ; FALLBACK14-NEXT: shlb $5, %al ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: sarq $63, %rdi ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: andb $6, %sil ; FALLBACK14-NEXT: movzbl %sil, %ecx ; FALLBACK14-NEXT: shrxq %rax, -72(%rsp,%rcx,4), %rsi ; FALLBACK14-NEXT: movq -64(%rsp,%rcx,4), %rdi ; FALLBACK14-NEXT: movq -56(%rsp,%rcx,4), %r8 ; FALLBACK14-NEXT: shrxq %rax, %r8, %r9 ; FALLBACK14-NEXT: movq -48(%rsp,%rcx,4), %rcx ; FALLBACK14-NEXT: shrxq %rax, %rdi, %r10 ; FALLBACK14-NEXT: sarxq %rax, %rcx, %r11 ; FALLBACK14-NEXT: # kill: def $al killed $al killed $rax def $rax ; FALLBACK14-NEXT: notb %al ; FALLBACK14-NEXT: addq %rdi, %rdi ; FALLBACK14-NEXT: shlxq %rax, %rdi, %rdi ; FALLBACK14-NEXT: orq %rsi, %rdi ; FALLBACK14-NEXT: addq %rcx, %rcx ; FALLBACK14-NEXT: shlxq %rax, %rcx, %rcx ; FALLBACK14-NEXT: orq %r9, %rcx ; FALLBACK14-NEXT: addq %r8, %r8 ; FALLBACK14-NEXT: shlxq %rax, %r8, %rax ; FALLBACK14-NEXT: orq %r10, %rax ; FALLBACK14-NEXT: movq %r11, 24(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) ; FALLBACK14-NEXT: movq %rcx, 16(%rdx) ; FALLBACK14-NEXT: movq %rdi, (%rdx) ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_32bytes_dwordOff: ; FALLBACK15: # %bb.0: ; FALLBACK15-NEXT: vmovups (%rdi), %xmm0 ; FALLBACK15-NEXT: movq 16(%rdi), %rax ; FALLBACK15-NEXT: movq 24(%rdi), %rdi ; FALLBACK15-NEXT: 
movzbl (%rsi), %esi ; FALLBACK15-NEXT: movl %esi, %ecx ; FALLBACK15-NEXT: shlb $5, %cl ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: sarq $63, %rdi ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: andb $6, %sil ; FALLBACK15-NEXT: movzbl %sil, %eax ; FALLBACK15-NEXT: movq -48(%rsp,%rax,4), %rsi ; FALLBACK15-NEXT: movq -56(%rsp,%rax,4), %rdi ; FALLBACK15-NEXT: movq %rdi, %r8 ; FALLBACK15-NEXT: shrdq %cl, %rsi, %r8 ; FALLBACK15-NEXT: movq -72(%rsp,%rax,4), %r9 ; FALLBACK15-NEXT: movq -64(%rsp,%rax,4), %rax ; FALLBACK15-NEXT: movq %rax, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rdi, %r10 ; FALLBACK15-NEXT: shrdq %cl, %rax, %r9 ; FALLBACK15-NEXT: sarxq %rcx, %rsi, %rax ; FALLBACK15-NEXT: movq %r10, 8(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) ; FALLBACK15-NEXT: movq %rax, 24(%rdx) ; FALLBACK15-NEXT: movq %r9, (%rdx) ; FALLBACK15-NEXT: retq ; ; X86-SSE2-LABEL: ashr_32bytes_dwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $92, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 4(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 8(%eax), %edi ; X86-SSE2-NEXT: movl 12(%eax), %ebx ; X86-SSE2-NEXT: movl 16(%eax), %ebp ; X86-SSE2-NEXT: movl 20(%eax), %esi ; X86-SSE2-NEXT: movl 24(%eax), %edx ; X86-SSE2-NEXT: movl 28(%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movzbl (%eax), %eax ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl 
%ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: sarl $31, %ecx ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andl $7, %eax ; X86-SSE2-NEXT: movl 16(%esp,%eax,4), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 20(%esp,%eax,4), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 28(%esp,%eax,4), %esi ; X86-SSE2-NEXT: movl 24(%esp,%eax,4), %edi ; X86-SSE2-NEXT: movl 36(%esp,%eax,4), %ebx ; X86-SSE2-NEXT: movl 32(%esp,%eax,4), %ebp ; X86-SSE2-NEXT: movl 44(%esp,%eax,4), %edx ; X86-SSE2-NEXT: movl 40(%esp,%eax,4), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl %edx, 28(%eax) ; X86-SSE2-NEXT: movl %ebp, 16(%eax) ; X86-SSE2-NEXT: movl %ebx, 20(%eax) ; X86-SSE2-NEXT: movl %edi, 8(%eax) ; X86-SSE2-NEXT: movl %esi, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: addl $92, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: ashr_32bytes_dwordOff: ; X86-SSE42: # %bb.0: 
; X86-SSE42-NEXT: pushl %ebx ; X86-SSE42-NEXT: pushl %edi ; X86-SSE42-NEXT: pushl %esi ; X86-SSE42-NEXT: subl $64, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movl 16(%edx), %esi ; X86-SSE42-NEXT: movl 20(%edx), %edi ; X86-SSE42-NEXT: movl 24(%edx), %ebx ; X86-SSE42-NEXT: movl 28(%edx), %edx ; X86-SSE42-NEXT: movzbl (%ecx), %ecx ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm0, (%esp) ; X86-SSE42-NEXT: sarl $31, %edx ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: andl $7, %ecx ; X86-SSE42-NEXT: movups (%esp,%ecx,4), %xmm0 ; X86-SSE42-NEXT: movups 16(%esp,%ecx,4), %xmm1 ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $64, %esp ; X86-SSE42-NEXT: popl %esi ; X86-SSE42-NEXT: popl %edi ; X86-SSE42-NEXT: popl %ebx ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: ashr_32bytes_dwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %ebx ; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: pushl %esi ; X86-AVX-NEXT: subl $64, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovups (%edx), %xmm0 ; X86-AVX-NEXT: movl 16(%edx), %esi ; X86-AVX-NEXT: movl 20(%edx), %edi ; X86-AVX-NEXT: movl 24(%edx), %ebx ; X86-AVX-NEXT: movl 28(%edx), %edx ; X86-AVX-NEXT: movzbl (%ecx), %ecx ; X86-AVX-NEXT: 
movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovaps %xmm0, (%esp) ; X86-AVX-NEXT: sarl $31, %edx ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: andl $7, %ecx ; X86-AVX-NEXT: vmovups (%esp,%ecx,4), %xmm0 ; X86-AVX-NEXT: vmovups 16(%esp,%ecx,4), %xmm1 ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: addl $64, %esp ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: popl %ebx ; X86-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %dwordOff = load i256, ptr %dwordOff.ptr, align 1 %bitOff = shl i256 %dwordOff, 5 %res = ashr i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } define void @ashr_32bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { ; X64-SSE2-LABEL: ashr_32bytes_qwordOff: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movq (%rdi), %rax ; X64-SSE2-NEXT: movq 8(%rdi), %rcx ; X64-SSE2-NEXT: movq 16(%rdi), %r8 ; X64-SSE2-NEXT: movq 24(%rdi), %rdi ; X64-SSE2-NEXT: movzbl (%rsi), %esi ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: sarq $63, %rdi ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: andl $3, %esi ; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %rax ; X64-SSE2-NEXT: movq -64(%rsp,%rsi,8), %rcx ; X64-SSE2-NEXT: movq 
-48(%rsp,%rsi,8), %rdi ; X64-SSE2-NEXT: movq -56(%rsp,%rsi,8), %rsi ; X64-SSE2-NEXT: movq %rsi, 16(%rdx) ; X64-SSE2-NEXT: movq %rdi, 24(%rdx) ; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: ashr_32bytes_qwordOff: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: movups (%rdi), %xmm0 ; X64-SSE42-NEXT: movq 16(%rdi), %rax ; X64-SSE42-NEXT: movq 24(%rdi), %rcx ; X64-SSE42-NEXT: movzbl (%rsi), %esi ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: sarq $63, %rcx ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: andl $3, %esi ; X64-SSE42-NEXT: movups -72(%rsp,%rsi,8), %xmm0 ; X64-SSE42-NEXT: movups -56(%rsp,%rsi,8), %xmm1 ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: ashr_32bytes_qwordOff: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovups (%rdi), %xmm0 ; X64-AVX-NEXT: movq 16(%rdi), %rax ; X64-AVX-NEXT: movq 24(%rdi), %rcx ; X64-AVX-NEXT: movzbl (%rsi), %esi ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: sarq $63, %rcx ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $3, %esi ; X64-AVX-NEXT: vmovups -72(%rsp,%rsi,8), %xmm0 ; X64-AVX-NEXT: vmovups -56(%rsp,%rsi,8), %xmm1 ; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX-NEXT: vmovups %xmm0, (%rdx) ; X64-AVX-NEXT: retq ; ; X86-SSE2-LABEL: ashr_32bytes_qwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: 
pushl %esi ; X86-SSE2-NEXT: subl $92, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 4(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 8(%eax), %edi ; X86-SSE2-NEXT: movl 12(%eax), %ebx ; X86-SSE2-NEXT: movl 16(%eax), %ebp ; X86-SSE2-NEXT: movl 20(%eax), %esi ; X86-SSE2-NEXT: movl 24(%eax), %edx ; X86-SSE2-NEXT: movl 28(%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movzbl (%eax), %eax ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: sarl $31, %ecx ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andl $3, %eax ; X86-SSE2-NEXT: movl 16(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 20(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 28(%esp,%eax,8), %esi ; X86-SSE2-NEXT: movl 24(%esp,%eax,8), %edi ; X86-SSE2-NEXT: movl 36(%esp,%eax,8), %ebx ; X86-SSE2-NEXT: movl 32(%esp,%eax,8), %ebp ; X86-SSE2-NEXT: movl 44(%esp,%eax,8), %edx ; X86-SSE2-NEXT: movl 
40(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl %edx, 28(%eax) ; X86-SSE2-NEXT: movl %ebp, 16(%eax) ; X86-SSE2-NEXT: movl %ebx, 20(%eax) ; X86-SSE2-NEXT: movl %edi, 8(%eax) ; X86-SSE2-NEXT: movl %esi, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: addl $92, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: ashr_32bytes_qwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: pushl %ebx ; X86-SSE42-NEXT: pushl %edi ; X86-SSE42-NEXT: pushl %esi ; X86-SSE42-NEXT: subl $64, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movl 16(%edx), %esi ; X86-SSE42-NEXT: movl 20(%edx), %edi ; X86-SSE42-NEXT: movl 24(%edx), %ebx ; X86-SSE42-NEXT: movl 28(%edx), %edx ; X86-SSE42-NEXT: movzbl (%ecx), %ecx ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm0, (%esp) ; X86-SSE42-NEXT: sarl $31, %edx ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: andl $3, %ecx ; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0 ; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1 ; X86-SSE42-NEXT: movups %xmm1, 
16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $64, %esp ; X86-SSE42-NEXT: popl %esi ; X86-SSE42-NEXT: popl %edi ; X86-SSE42-NEXT: popl %ebx ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: ashr_32bytes_qwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %ebx ; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: pushl %esi ; X86-AVX-NEXT: subl $64, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovups (%edx), %xmm0 ; X86-AVX-NEXT: movl 16(%edx), %esi ; X86-AVX-NEXT: movl 20(%edx), %edi ; X86-AVX-NEXT: movl 24(%edx), %ebx ; X86-AVX-NEXT: movl 28(%edx), %edx ; X86-AVX-NEXT: movzbl (%ecx), %ecx ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovaps %xmm0, (%esp) ; X86-AVX-NEXT: sarl $31, %edx ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: andl $3, %ecx ; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0 ; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1 ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: addl $64, %esp ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: popl %ebx ; X86-AVX-NEXT: retl %src = load i256, ptr %src.ptr, align 1 %qwordOff = load i256, ptr %qwordOff.ptr, align 1 %bitOff = shl i256 %qwordOff, 6 %res = ashr i256 %src, %bitOff store i256 %res, ptr %dst, align 1 ret void } define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: lshr_64bytes: ; FALLBACK0: # %bb.0: ; FALLBACK0-NEXT: pushq 
%r15 ; FALLBACK0-NEXT: pushq %r14 ; FALLBACK0-NEXT: pushq %r13 ; FALLBACK0-NEXT: pushq %r12 ; FALLBACK0-NEXT: pushq %rbx ; FALLBACK0-NEXT: movq (%rdi), %rax ; FALLBACK0-NEXT: movq 8(%rdi), %rcx ; FALLBACK0-NEXT: movq 16(%rdi), %r8 ; FALLBACK0-NEXT: movq 24(%rdi), %r9 ; FALLBACK0-NEXT: movq 32(%rdi), %r10 ; FALLBACK0-NEXT: movq 40(%rdi), %r11 ; FALLBACK0-NEXT: movq 48(%rdi), %rbx ; FALLBACK0-NEXT: movq 56(%rdi), %r14 ; FALLBACK0-NEXT: movl (%rsi), %edi ; FALLBACK0-NEXT: xorps %xmm0, %xmm0 ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: leal (,%rdi,8), %eax ; FALLBACK0-NEXT: andl $56, %eax ; FALLBACK0-NEXT: andl $56, %edi ; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10 ; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8 ; FALLBACK0-NEXT: movq %r8, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx ; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r9 ; FALLBACK0-NEXT: orq %r11, %r9 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r10 ; FALLBACK0-NEXT: addq %r8, %r8 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 ; FALLBACK0-NEXT: orq %r10, %r8 ; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10 ; FALLBACK0-NEXT: movq %r10, %r15 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r15 ; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14 ; 
FALLBACK0-NEXT: leaq (%r14,%r14), %r11 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r11 ; FALLBACK0-NEXT: orq %r15, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx ; FALLBACK0-NEXT: addq %r10, %r10 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: orq %rbx, %r10 ; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx ; FALLBACK0-NEXT: movq %rbx, %r12 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r12 ; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13 ; FALLBACK0-NEXT: leaq (%r13,%r13), %r15 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r15 ; FALLBACK0-NEXT: orq %r12, %r15 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r14 ; FALLBACK0-NEXT: addq %rbx, %rbx ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %rbx ; FALLBACK0-NEXT: orq %r14, %rbx ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r13 ; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi ; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r14 ; FALLBACK0-NEXT: orq %r13, %r14 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %rdi ; FALLBACK0-NEXT: movq %rdi, 56(%rdx) ; FALLBACK0-NEXT: movq %r14, 48(%rdx) ; FALLBACK0-NEXT: movq %rbx, 32(%rdx) ; FALLBACK0-NEXT: movq %r15, 40(%rdx) ; FALLBACK0-NEXT: movq %r10, 16(%rdx) ; FALLBACK0-NEXT: movq %r11, 24(%rdx) ; FALLBACK0-NEXT: movq %r8, (%rdx) ; FALLBACK0-NEXT: movq %r9, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx ; FALLBACK0-NEXT: popq %r12 ; FALLBACK0-NEXT: popq %r13 ; FALLBACK0-NEXT: popq %r14 ; FALLBACK0-NEXT: popq %r15 ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: lshr_64bytes: ; FALLBACK1: # %bb.0: ; FALLBACK1-NEXT: pushq %r15 ; FALLBACK1-NEXT: pushq %r14 ; FALLBACK1-NEXT: pushq %rbx ; FALLBACK1-NEXT: movq (%rdi), %rcx ; FALLBACK1-NEXT: movq 8(%rdi), %r8 ; FALLBACK1-NEXT: movq 16(%rdi), %r9 ; FALLBACK1-NEXT: movq 24(%rdi), %r10 ; FALLBACK1-NEXT: movq 32(%rdi), %r11 ; FALLBACK1-NEXT: 
movq 40(%rdi), %rbx ; FALLBACK1-NEXT: movq 48(%rdi), %r14 ; FALLBACK1-NEXT: movq 56(%rdi), %rdi ; FALLBACK1-NEXT: movl (%rsi), %eax ; FALLBACK1-NEXT: xorps %xmm0, %xmm0 ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: leal (,%rax,8), %ecx ; FALLBACK1-NEXT: andl $56, %ecx ; FALLBACK1-NEXT: andl $56, %eax ; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi ; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi ; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9 ; FALLBACK1-NEXT: movq %r9, %r8 ; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8 ; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10 ; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11 ; FALLBACK1-NEXT: movq %r11, %rbx ; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx ; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11 ; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14 ; FALLBACK1-NEXT: movq %r14, %r15 ; FALLBACK1-NEXT: shrdq %cl, %r11, %r15 ; FALLBACK1-NEXT: shrdq %cl, %r14, %r10 ; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK1-NEXT: shrdq %cl, %rax, %r11 ; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK1-NEXT: shrq %cl, %rax ; FALLBACK1-NEXT: movq %r11, 48(%rdx) ; FALLBACK1-NEXT: movq %rax, 56(%rdx) ; FALLBACK1-NEXT: movq %r10, 32(%rdx) ; FALLBACK1-NEXT: movq %r15, 40(%rdx) ; FALLBACK1-NEXT: movq %rdi, 16(%rdx) ; FALLBACK1-NEXT: movq %rbx, 24(%rdx) ; FALLBACK1-NEXT: movq %rsi, (%rdx) ; FALLBACK1-NEXT: movq %r8, 8(%rdx) ; FALLBACK1-NEXT: popq %rbx 
; FALLBACK1-NEXT: popq %r14 ; FALLBACK1-NEXT: popq %r15 ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: lshr_64bytes: ; FALLBACK2: # %bb.0: ; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 ; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax ; FALLBACK2-NEXT: movq (%rdi), %rcx ; FALLBACK2-NEXT: movq 8(%rdi), %r8 ; FALLBACK2-NEXT: movq 16(%rdi), %r9 ; FALLBACK2-NEXT: movq 24(%rdi), %r10 ; FALLBACK2-NEXT: movq 32(%rdi), %r11 ; FALLBACK2-NEXT: movq 40(%rdi), %rbx ; FALLBACK2-NEXT: movq 48(%rdi), %r14 ; FALLBACK2-NEXT: movq 56(%rdi), %rdi ; FALLBACK2-NEXT: movl (%rsi), %eax ; FALLBACK2-NEXT: xorps %xmm0, %xmm0 ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx ; FALLBACK2-NEXT: andl $56, %ecx ; FALLBACK2-NEXT: andl $56, %eax ; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi ; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 ; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx ; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 ; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi ; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 ; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 ; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 ; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 ; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 ; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp ; FALLBACK2-NEXT: movl %ecx, %r12d ; FALLBACK2-NEXT: notb %r12b ; FALLBACK2-NEXT: addq %r9, %r9 ; FALLBACK2-NEXT: shlxq %r12, %r9, 
%r9 ; FALLBACK2-NEXT: orq %rbx, %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi ; FALLBACK2-NEXT: orq %r13, %rdi ; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx ; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 ; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx ; FALLBACK2-NEXT: addq %r10, %r10 ; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 ; FALLBACK2-NEXT: orq %r8, %r10 ; FALLBACK2-NEXT: addq %rsi, %rsi ; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi ; FALLBACK2-NEXT: orq %r11, %rsi ; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 ; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 ; FALLBACK2-NEXT: orq %r15, %r8 ; FALLBACK2-NEXT: addq %r14, %r14 ; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 ; FALLBACK2-NEXT: orq %rbp, %r11 ; FALLBACK2-NEXT: addq %rax, %rax ; FALLBACK2-NEXT: shlxq %r12, %rax, %rax ; FALLBACK2-NEXT: orq %r13, %rax ; FALLBACK2-NEXT: movq %rcx, 56(%rdx) ; FALLBACK2-NEXT: movq %rax, 48(%rdx) ; FALLBACK2-NEXT: movq %r11, 32(%rdx) ; FALLBACK2-NEXT: movq %r8, 40(%rdx) ; FALLBACK2-NEXT: movq %rsi, 16(%rdx) ; FALLBACK2-NEXT: movq %r10, 24(%rdx) ; FALLBACK2-NEXT: movq %rdi, (%rdx) ; FALLBACK2-NEXT: movq %r9, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 ; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 ; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: lshr_64bytes: ; FALLBACK3: # %bb.0: ; FALLBACK3-NEXT: pushq %r15 ; FALLBACK3-NEXT: pushq %r14 ; FALLBACK3-NEXT: pushq %rbx ; FALLBACK3-NEXT: movq (%rdi), %rcx ; FALLBACK3-NEXT: movq 8(%rdi), %r8 ; FALLBACK3-NEXT: movq 16(%rdi), %r9 ; FALLBACK3-NEXT: movq 24(%rdi), %r10 ; FALLBACK3-NEXT: movq 32(%rdi), %r11 ; FALLBACK3-NEXT: movq 40(%rdi), %rbx ; FALLBACK3-NEXT: movq 48(%rdi), %r14 ; FALLBACK3-NEXT: movq 56(%rdi), %rdi ; FALLBACK3-NEXT: movl (%rsi), %eax ; FALLBACK3-NEXT: xorps %xmm0, %xmm0 ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; 
FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: leal (,%rax,8), %ecx ; FALLBACK3-NEXT: andl $56, %ecx ; FALLBACK3-NEXT: andl $56, %eax ; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi ; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi ; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9 ; FALLBACK3-NEXT: movq %r9, %r8 ; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8 ; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10 ; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11 ; FALLBACK3-NEXT: movq %r11, %rbx ; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx ; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11 ; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14 ; FALLBACK3-NEXT: movq %r14, %r15 ; FALLBACK3-NEXT: shrdq %cl, %r11, %r15 ; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 ; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 ; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax ; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi ; FALLBACK3-NEXT: movq %r11, 48(%rdx) ; FALLBACK3-NEXT: movq %r10, 32(%rdx) ; FALLBACK3-NEXT: movq %r15, 40(%rdx) ; FALLBACK3-NEXT: movq %rdi, 16(%rdx) ; FALLBACK3-NEXT: movq %rbx, 24(%rdx) ; FALLBACK3-NEXT: movq %rsi, (%rdx) ; FALLBACK3-NEXT: movq %r8, 8(%rdx) ; FALLBACK3-NEXT: movq %rax, 56(%rdx) ; FALLBACK3-NEXT: popq %rbx ; FALLBACK3-NEXT: popq %r14 ; FALLBACK3-NEXT: popq %r15 ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: lshr_64bytes: ; FALLBACK4: # %bb.0: ; FALLBACK4-NEXT: pushq %rbp ; FALLBACK4-NEXT: pushq %r15 ; FALLBACK4-NEXT: pushq %r14 ; FALLBACK4-NEXT: pushq %r13 ; 
FALLBACK4-NEXT: pushq %r12 ; FALLBACK4-NEXT: pushq %rbx ; FALLBACK4-NEXT: pushq %rax ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK4-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK4-NEXT: movl (%rsi), %r8d ; FALLBACK4-NEXT: xorps %xmm4, %xmm4 ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: leal (,%r8,8), %eax ; FALLBACK4-NEXT: andl $56, %eax ; FALLBACK4-NEXT: andl $56, %r8d ; FALLBACK4-NEXT: movq -128(%rsp,%r8), %r10 ; FALLBACK4-NEXT: movq -120(%rsp,%r8), %r9 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r9,%r9), %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rdi ; FALLBACK4-NEXT: orq %r10, %rdi ; FALLBACK4-NEXT: movq -104(%rsp,%r8), %r10 ; FALLBACK4-NEXT: movq %r10, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %rbx ; FALLBACK4-NEXT: movq -96(%rsp,%r8), %r12 ; FALLBACK4-NEXT: leaq (%r12,%r12), %r11 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r11 ; FALLBACK4-NEXT: orq %rbx, %r11 ; FALLBACK4-NEXT: movq -112(%rsp,%r8), %rbx ; FALLBACK4-NEXT: movq %rbx, %r14 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r14 ; FALLBACK4-NEXT: addq %r10, %r10 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: orq %r14, %r10 ; FALLBACK4-NEXT: movq -88(%rsp,%r8), %r14 ; FALLBACK4-NEXT: movq %r14, %r13 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r13 ; FALLBACK4-NEXT: movq -80(%rsp,%r8), %rbp ; FALLBACK4-NEXT: leaq 
(%rbp,%rbp), %r15 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r15 ; FALLBACK4-NEXT: orq %r13, %r15 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r12 ; FALLBACK4-NEXT: addq %r14, %r14 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r14 ; FALLBACK4-NEXT: orq %r12, %r14 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %rbp ; FALLBACK4-NEXT: movq -72(%rsp,%r8), %r8 ; FALLBACK4-NEXT: leaq (%r8,%r8), %r12 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r12 ; FALLBACK4-NEXT: orq %rbp, %r12 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r9 ; FALLBACK4-NEXT: addq %rbx, %rbx ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rbx ; FALLBACK4-NEXT: orq %r9, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 ; FALLBACK4-NEXT: movq %r8, 56(%rdx) ; FALLBACK4-NEXT: movq %rbx, 8(%rdx) ; FALLBACK4-NEXT: movq %r12, 48(%rdx) ; FALLBACK4-NEXT: movq %r14, 32(%rdx) ; FALLBACK4-NEXT: movq %r15, 40(%rdx) ; FALLBACK4-NEXT: movq %r10, 16(%rdx) ; FALLBACK4-NEXT: movq %r11, 24(%rdx) ; FALLBACK4-NEXT: movq %rdi, (%rdx) ; FALLBACK4-NEXT: addq $8, %rsp ; FALLBACK4-NEXT: popq %rbx ; FALLBACK4-NEXT: popq %r12 ; FALLBACK4-NEXT: popq %r13 ; FALLBACK4-NEXT: popq %r14 ; FALLBACK4-NEXT: popq %r15 ; FALLBACK4-NEXT: popq %rbp ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: lshr_64bytes: ; FALLBACK5: # %bb.0: ; FALLBACK5-NEXT: pushq %r15 ; FALLBACK5-NEXT: pushq %r14 ; FALLBACK5-NEXT: pushq %rbx ; FALLBACK5-NEXT: movups (%rdi), %xmm0 ; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK5-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK5-NEXT: movl (%rsi), %eax ; FALLBACK5-NEXT: xorps %xmm4, %xmm4 ; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; 
FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: leal (,%rax,8), %ecx ; FALLBACK5-NEXT: andl $56, %ecx ; FALLBACK5-NEXT: andl $56, %eax ; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9 ; FALLBACK5-NEXT: movq %r9, %rsi ; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi ; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK5-NEXT: movq %r10, %r8 ; FALLBACK5-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9 ; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK5-NEXT: movq %r11, %rbx ; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK5-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14 ; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK5-NEXT: movq %rax, %r15 ; FALLBACK5-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK5-NEXT: shrdq %cl, %rax, %r14 ; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK5-NEXT: shrq %cl, %r11 ; FALLBACK5-NEXT: movq %r15, 8(%rdx) ; FALLBACK5-NEXT: movq %r9, 48(%rdx) ; FALLBACK5-NEXT: movq %r11, 56(%rdx) ; FALLBACK5-NEXT: movq %rdi, 32(%rdx) ; FALLBACK5-NEXT: movq %rbx, 40(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) ; FALLBACK5-NEXT: movq %r14, (%rdx) ; FALLBACK5-NEXT: popq %rbx ; FALLBACK5-NEXT: popq %r14 ; FALLBACK5-NEXT: popq %r15 ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: lshr_64bytes: ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx ; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK6-NEXT: movl (%rsi), %eax ; FALLBACK6-NEXT: xorps 
%xmm4, %xmm4 ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: leal (,%rax,8), %esi ; FALLBACK6-NEXT: andl $56, %esi ; FALLBACK6-NEXT: andl $56, %eax ; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 ; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx ; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi ; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 ; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 ; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 ; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 ; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 ; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 ; FALLBACK6-NEXT: movl %esi, %ebx ; FALLBACK6-NEXT: notb %bl ; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp ; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 ; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK6-NEXT: orq %r11, %r8 ; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 ; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK6-NEXT: orq %r12, %r11 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 ; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 ; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi ; FALLBACK6-NEXT: addq %rdi, %rdi ; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi ; FALLBACK6-NEXT: orq %r9, %rdi ; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 ; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 ; FALLBACK6-NEXT: orq %r14, %r9 ; FALLBACK6-NEXT: addq %r10, %r10 ; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 ; FALLBACK6-NEXT: orq %r15, %r10 ; FALLBACK6-NEXT: addq %rax, %rax ; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax ; FALLBACK6-NEXT: orq %r13, %rax ; FALLBACK6-NEXT: addq %rcx, %rcx ; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx ; FALLBACK6-NEXT: orq %rbp, %rcx ; 
FALLBACK6-NEXT: movq %rsi, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) ; FALLBACK6-NEXT: movq %rax, 48(%rdx) ; FALLBACK6-NEXT: movq %r10, 32(%rdx) ; FALLBACK6-NEXT: movq %r9, 40(%rdx) ; FALLBACK6-NEXT: movq %rdi, 16(%rdx) ; FALLBACK6-NEXT: movq %r11, 24(%rdx) ; FALLBACK6-NEXT: movq %r8, (%rdx) ; FALLBACK6-NEXT: addq $8, %rsp ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 ; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: lshr_64bytes: ; FALLBACK7: # %bb.0: ; FALLBACK7-NEXT: pushq %r15 ; FALLBACK7-NEXT: pushq %r14 ; FALLBACK7-NEXT: pushq %rbx ; FALLBACK7-NEXT: movups (%rdi), %xmm0 ; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK7-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK7-NEXT: movl (%rsi), %eax ; FALLBACK7-NEXT: xorps %xmm4, %xmm4 ; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: leal (,%rax,8), %ecx ; FALLBACK7-NEXT: andl $56, %ecx ; FALLBACK7-NEXT: andl $56, %eax ; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9 ; FALLBACK7-NEXT: movq %r9, %rsi ; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi ; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK7-NEXT: movq %r10, %r8 ; FALLBACK7-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9 ; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK7-NEXT: movq %r11, %rbx ; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK7-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK7-NEXT: movq 
-128(%rsp,%rax), %r14 ; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK7-NEXT: movq %rax, %r15 ; FALLBACK7-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK7-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK7-NEXT: shrdq %cl, %rax, %r14 ; FALLBACK7-NEXT: movq %r15, 8(%rdx) ; FALLBACK7-NEXT: movq %r9, 48(%rdx) ; FALLBACK7-NEXT: movq %rdi, 32(%rdx) ; FALLBACK7-NEXT: movq %rbx, 40(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) ; FALLBACK7-NEXT: movq %rsi, 24(%rdx) ; FALLBACK7-NEXT: movq %r14, (%rdx) ; FALLBACK7-NEXT: movq %r10, 56(%rdx) ; FALLBACK7-NEXT: popq %rbx ; FALLBACK7-NEXT: popq %r14 ; FALLBACK7-NEXT: popq %r15 ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: lshr_64bytes: ; FALLBACK8: # %bb.0: ; FALLBACK8-NEXT: pushq %rbp ; FALLBACK8-NEXT: pushq %r15 ; FALLBACK8-NEXT: pushq %r14 ; FALLBACK8-NEXT: pushq %r13 ; FALLBACK8-NEXT: pushq %r12 ; FALLBACK8-NEXT: pushq %rbx ; FALLBACK8-NEXT: pushq %rax ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1 ; FALLBACK8-NEXT: movl (%rsi), %r9d ; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: leal (,%r9,8), %eax ; FALLBACK8-NEXT: andl $56, %eax ; FALLBACK8-NEXT: andl $56, %r9d ; FALLBACK8-NEXT: movq -128(%rsp,%r9), %r10 ; FALLBACK8-NEXT: movq -120(%rsp,%r9), %r8 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rdi ; FALLBACK8-NEXT: orq %r10, %rdi ; FALLBACK8-NEXT: movq -104(%rsp,%r9), %r10 ; FALLBACK8-NEXT: movq %r10, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %rbx ; FALLBACK8-NEXT: movq -96(%rsp,%r9), %r12 ; FALLBACK8-NEXT: leaq (%r12,%r12), %r11 
; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r11 ; FALLBACK8-NEXT: orq %rbx, %r11 ; FALLBACK8-NEXT: movq -112(%rsp,%r9), %rbx ; FALLBACK8-NEXT: movq %rbx, %r14 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r14 ; FALLBACK8-NEXT: addq %r10, %r10 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: orq %r14, %r10 ; FALLBACK8-NEXT: movq -88(%rsp,%r9), %r14 ; FALLBACK8-NEXT: movq %r14, %r13 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r13 ; FALLBACK8-NEXT: movq -80(%rsp,%r9), %rbp ; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r15 ; FALLBACK8-NEXT: orq %r13, %r15 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r12 ; FALLBACK8-NEXT: addq %r14, %r14 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r14 ; FALLBACK8-NEXT: orq %r12, %r14 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %rbp ; FALLBACK8-NEXT: movq -72(%rsp,%r9), %r9 ; FALLBACK8-NEXT: leaq (%r9,%r9), %r12 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r12 ; FALLBACK8-NEXT: orq %rbp, %r12 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r8 ; FALLBACK8-NEXT: addq %rbx, %rbx ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rbx ; FALLBACK8-NEXT: orq %r8, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r9 ; FALLBACK8-NEXT: movq %r9, 56(%rdx) ; FALLBACK8-NEXT: movq %rbx, 8(%rdx) ; FALLBACK8-NEXT: movq %r12, 48(%rdx) ; FALLBACK8-NEXT: movq %r14, 32(%rdx) ; FALLBACK8-NEXT: movq %r15, 40(%rdx) ; FALLBACK8-NEXT: movq %r10, 16(%rdx) ; FALLBACK8-NEXT: movq %r11, 24(%rdx) ; FALLBACK8-NEXT: movq %rdi, (%rdx) ; FALLBACK8-NEXT: addq $8, %rsp ; FALLBACK8-NEXT: popq %rbx ; FALLBACK8-NEXT: popq %r12 ; FALLBACK8-NEXT: popq %r13 ; FALLBACK8-NEXT: popq %r14 ; FALLBACK8-NEXT: popq %r15 ; FALLBACK8-NEXT: popq %rbp ; FALLBACK8-NEXT: vzeroupper ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: 
lshr_64bytes: ; FALLBACK9: # %bb.0: ; FALLBACK9-NEXT: pushq %r15 ; FALLBACK9-NEXT: pushq %r14 ; FALLBACK9-NEXT: pushq %rbx ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 ; FALLBACK9-NEXT: movl (%rsi), %eax ; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: leal (,%rax,8), %ecx ; FALLBACK9-NEXT: andl $56, %ecx ; FALLBACK9-NEXT: andl $56, %eax ; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 ; FALLBACK9-NEXT: movq %r9, %rsi ; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi ; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK9-NEXT: movq %r10, %r8 ; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 ; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK9-NEXT: movq %r11, %rbx ; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 ; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK9-NEXT: movq %rax, %r15 ; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shrq %cl, %r11 ; FALLBACK9-NEXT: movq %r15, 8(%rdx) ; FALLBACK9-NEXT: movq %r9, 48(%rdx) ; FALLBACK9-NEXT: movq %r11, 56(%rdx) ; FALLBACK9-NEXT: movq %rdi, 32(%rdx) ; FALLBACK9-NEXT: movq %rbx, 40(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) ; FALLBACK9-NEXT: movq %r14, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 ; FALLBACK9-NEXT: popq %r15 ; FALLBACK9-NEXT: vzeroupper ; FALLBACK9-NEXT: retq ; ; FALLBACK10-LABEL: lshr_64bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq 
%r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx ; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 ; FALLBACK10-NEXT: movl (%rsi), %eax ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: leal (,%rax,8), %esi ; FALLBACK10-NEXT: andl $56, %esi ; FALLBACK10-NEXT: andl $56, %eax ; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 ; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx ; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi ; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 ; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 ; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 ; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 ; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 ; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 ; FALLBACK10-NEXT: movl %esi, %ebx ; FALLBACK10-NEXT: notb %bl ; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp ; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 ; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK10-NEXT: orq %r11, %r8 ; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 ; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK10-NEXT: orq %r12, %r11 ; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 ; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 ; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi ; FALLBACK10-NEXT: addq %rdi, %rdi ; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi ; FALLBACK10-NEXT: orq %r9, %rdi ; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 ; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 ; FALLBACK10-NEXT: orq %r14, %r9 ; FALLBACK10-NEXT: addq %r10, %r10 ; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 ; FALLBACK10-NEXT: orq %r15, %r10 ; FALLBACK10-NEXT: addq %rax, %rax ; FALLBACK10-NEXT: shlxq %rbx, 
%rax, %rax ; FALLBACK10-NEXT: orq %r13, %rax ; FALLBACK10-NEXT: addq %rcx, %rcx ; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx ; FALLBACK10-NEXT: orq %rbp, %rcx ; FALLBACK10-NEXT: movq %rsi, 56(%rdx) ; FALLBACK10-NEXT: movq %rcx, 8(%rdx) ; FALLBACK10-NEXT: movq %rax, 48(%rdx) ; FALLBACK10-NEXT: movq %r10, 32(%rdx) ; FALLBACK10-NEXT: movq %r9, 40(%rdx) ; FALLBACK10-NEXT: movq %rdi, 16(%rdx) ; FALLBACK10-NEXT: movq %r11, 24(%rdx) ; FALLBACK10-NEXT: movq %r8, (%rdx) ; FALLBACK10-NEXT: addq $8, %rsp ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 ; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: lshr_64bytes: ; FALLBACK11: # %bb.0: ; FALLBACK11-NEXT: pushq %r15 ; FALLBACK11-NEXT: pushq %r14 ; FALLBACK11-NEXT: pushq %rbx ; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1 ; FALLBACK11-NEXT: movl (%rsi), %eax ; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: leal (,%rax,8), %ecx ; FALLBACK11-NEXT: andl $56, %ecx ; FALLBACK11-NEXT: andl $56, %eax ; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9 ; FALLBACK11-NEXT: movq %r9, %rsi ; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi ; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK11-NEXT: movq %r10, %r8 ; FALLBACK11-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9 ; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK11-NEXT: movq %r11, %rbx ; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK11-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14 ; FALLBACK11-NEXT: movq 
-120(%rsp,%rax), %rax ; FALLBACK11-NEXT: movq %rax, %r15 ; FALLBACK11-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK11-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK11-NEXT: shrdq %cl, %rax, %r14 ; FALLBACK11-NEXT: movq %r15, 8(%rdx) ; FALLBACK11-NEXT: movq %r9, 48(%rdx) ; FALLBACK11-NEXT: movq %rdi, 32(%rdx) ; FALLBACK11-NEXT: movq %rbx, 40(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) ; FALLBACK11-NEXT: movq %rsi, 24(%rdx) ; FALLBACK11-NEXT: movq %r14, (%rdx) ; FALLBACK11-NEXT: movq %r10, 56(%rdx) ; FALLBACK11-NEXT: popq %rbx ; FALLBACK11-NEXT: popq %r14 ; FALLBACK11-NEXT: popq %r15 ; FALLBACK11-NEXT: vzeroupper ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: lshr_64bytes: ; FALLBACK12: # %bb.0: ; FALLBACK12-NEXT: pushq %rbp ; FALLBACK12-NEXT: pushq %r15 ; FALLBACK12-NEXT: pushq %r14 ; FALLBACK12-NEXT: pushq %r13 ; FALLBACK12-NEXT: pushq %r12 ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: pushq %rax ; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK12-NEXT: movl (%rsi), %r9d ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: leal (,%r9,8), %eax ; FALLBACK12-NEXT: andl $56, %eax ; FALLBACK12-NEXT: andl $56, %r9d ; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 ; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rdi ; FALLBACK12-NEXT: orq %r10, %rdi ; FALLBACK12-NEXT: movq -104(%rsp,%r9), %r10 ; FALLBACK12-NEXT: movq %r10, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbx ; FALLBACK12-NEXT: movq -96(%rsp,%r9), %r12 ; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r11 ; FALLBACK12-NEXT: orq %rbx, %r11 ; 
FALLBACK12-NEXT: movq -112(%rsp,%r9), %rbx ; FALLBACK12-NEXT: movq %rbx, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r14 ; FALLBACK12-NEXT: addq %r10, %r10 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: orq %r14, %r10 ; FALLBACK12-NEXT: movq -88(%rsp,%r9), %r14 ; FALLBACK12-NEXT: movq %r14, %r13 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r13 ; FALLBACK12-NEXT: movq -80(%rsp,%r9), %rbp ; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r15 ; FALLBACK12-NEXT: orq %r13, %r15 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r12 ; FALLBACK12-NEXT: addq %r14, %r14 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r14 ; FALLBACK12-NEXT: orq %r12, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbp ; FALLBACK12-NEXT: movq -72(%rsp,%r9), %r9 ; FALLBACK12-NEXT: leaq (%r9,%r9), %r12 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: orq %rbp, %r12 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r8 ; FALLBACK12-NEXT: addq %rbx, %rbx ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r8, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: movq %r9, 56(%rdx) ; FALLBACK12-NEXT: movq %rbx, 8(%rdx) ; FALLBACK12-NEXT: movq %r12, 48(%rdx) ; FALLBACK12-NEXT: movq %r14, 32(%rdx) ; FALLBACK12-NEXT: movq %r15, 40(%rdx) ; FALLBACK12-NEXT: movq %r10, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) ; FALLBACK12-NEXT: movq %rdi, (%rdx) ; FALLBACK12-NEXT: addq $8, %rsp ; FALLBACK12-NEXT: popq %rbx ; FALLBACK12-NEXT: popq %r12 ; FALLBACK12-NEXT: popq %r13 ; FALLBACK12-NEXT: popq %r14 ; FALLBACK12-NEXT: popq %r15 ; FALLBACK12-NEXT: popq %rbp ; FALLBACK12-NEXT: vzeroupper ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: lshr_64bytes: ; FALLBACK13: # %bb.0: ; 
FALLBACK13-NEXT: pushq %r15 ; FALLBACK13-NEXT: pushq %r14 ; FALLBACK13-NEXT: pushq %rbx ; FALLBACK13-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK13-NEXT: movl (%rsi), %edi ; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: leal (,%rdi,8), %ecx ; FALLBACK13-NEXT: andl $56, %ecx ; FALLBACK13-NEXT: andl $56, %edi ; FALLBACK13-NEXT: movq -96(%rsp,%rdi), %rsi ; FALLBACK13-NEXT: movq -104(%rsp,%rdi), %r9 ; FALLBACK13-NEXT: movq %r9, %rax ; FALLBACK13-NEXT: shrdq %cl, %rsi, %rax ; FALLBACK13-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK13-NEXT: movq %r10, %r8 ; FALLBACK13-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK13-NEXT: movq -80(%rsp,%rdi), %r9 ; FALLBACK13-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK13-NEXT: movq %r11, %rbx ; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK13-NEXT: shrdq %cl, %r11, %rsi ; FALLBACK13-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK13-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK13-NEXT: movq -128(%rsp,%rdi), %r14 ; FALLBACK13-NEXT: movq -120(%rsp,%rdi), %rdi ; FALLBACK13-NEXT: movq %rdi, %r15 ; FALLBACK13-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK13-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK13-NEXT: shrq %cl, %r11 ; FALLBACK13-NEXT: movq %r15, 8(%rdx) ; FALLBACK13-NEXT: movq %r9, 48(%rdx) ; FALLBACK13-NEXT: movq %r11, 56(%rdx) ; FALLBACK13-NEXT: movq %rsi, 32(%rdx) ; FALLBACK13-NEXT: movq %rbx, 40(%rdx) ; FALLBACK13-NEXT: movq %r8, 16(%rdx) ; FALLBACK13-NEXT: movq %rax, 24(%rdx) ; FALLBACK13-NEXT: movq %r14, (%rdx) ; FALLBACK13-NEXT: popq %rbx ; FALLBACK13-NEXT: popq %r14 ; FALLBACK13-NEXT: popq %r15 ; FALLBACK13-NEXT: vzeroupper ; FALLBACK13-NEXT: retq ; ; FALLBACK14-LABEL: lshr_64bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx ; 
FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK14-NEXT: movl (%rsi), %esi ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: leal (,%rsi,8), %ecx ; FALLBACK14-NEXT: andl $56, %ecx ; FALLBACK14-NEXT: andl $56, %esi ; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11 ; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax ; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12 ; FALLBACK14-NEXT: movq -96(%rsp,%rsi), %r13 ; FALLBACK14-NEXT: shrxq %rcx, %rax, %r9 ; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10 ; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14 ; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15 ; FALLBACK14-NEXT: movl %ecx, %ebx ; FALLBACK14-NEXT: notb %bl ; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp ; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 ; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK14-NEXT: orq %r11, %r8 ; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 ; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK14-NEXT: orq %r12, %r11 ; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12 ; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13 ; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp ; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi ; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx ; FALLBACK14-NEXT: addq %rdi, %rdi ; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi ; FALLBACK14-NEXT: orq %r9, %rdi ; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 ; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 ; FALLBACK14-NEXT: orq %r14, %r9 ; FALLBACK14-NEXT: addq %r10, %r10 ; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 ; FALLBACK14-NEXT: orq %r15, %r10 ; FALLBACK14-NEXT: addq %rsi, %rsi ; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi ; FALLBACK14-NEXT: orq %r13, %rsi ; FALLBACK14-NEXT: addq %rax, %rax ; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax ; FALLBACK14-NEXT: orq %rbp, %rax ; FALLBACK14-NEXT: movq %rcx, 56(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) ; FALLBACK14-NEXT: movq %rsi, 48(%rdx) ; 
FALLBACK14-NEXT: movq %r10, 32(%rdx) ; FALLBACK14-NEXT: movq %r9, 40(%rdx) ; FALLBACK14-NEXT: movq %rdi, 16(%rdx) ; FALLBACK14-NEXT: movq %r11, 24(%rdx) ; FALLBACK14-NEXT: movq %r8, (%rdx) ; FALLBACK14-NEXT: addq $8, %rsp ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 ; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: lshr_64bytes: ; FALLBACK15: # %bb.0: ; FALLBACK15-NEXT: pushq %r15 ; FALLBACK15-NEXT: pushq %r14 ; FALLBACK15-NEXT: pushq %rbx ; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK15-NEXT: movl (%rsi), %eax ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: leal (,%rax,8), %ecx ; FALLBACK15-NEXT: andl $56, %ecx ; FALLBACK15-NEXT: andl $56, %eax ; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 ; FALLBACK15-NEXT: movq %r9, %rsi ; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi ; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK15-NEXT: movq %r10, %r8 ; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 ; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK15-NEXT: movq %r11, %rbx ; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 ; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK15-NEXT: movq %rax, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) ; FALLBACK15-NEXT: movq %rdi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq 
%r8, 16(%rdx) ; FALLBACK15-NEXT: movq %rsi, 24(%rdx) ; FALLBACK15-NEXT: movq %r14, (%rdx) ; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx ; FALLBACK15-NEXT: popq %r14 ; FALLBACK15-NEXT: popq %r15 ; FALLBACK15-NEXT: vzeroupper ; FALLBACK15-NEXT: retq ; ; FALLBACK16-LABEL: lshr_64bytes: ; FALLBACK16: # %bb.0: ; FALLBACK16-NEXT: pushl %ebp ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi ; FALLBACK16-NEXT: subl $204, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl (%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 4(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 8(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 12(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 16(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 20(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 24(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 28(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 32(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 36(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 40(%eax), %ebp ; FALLBACK16-NEXT: movl 44(%eax), %ebx ; FALLBACK16-NEXT: movl 48(%eax), %edi ; FALLBACK16-NEXT: movl 52(%eax), %esi ; FALLBACK16-NEXT: movl 56(%eax), %edx ; FALLBACK16-NEXT: movl 60(%eax), %ecx ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl (%eax), %eax ; FALLBACK16-NEXT: xorps %xmm0, %xmm0 ; FALLBACK16-NEXT: movaps 
%xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %eax, %esi ; FALLBACK16-NEXT: andl $60, %esi ; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx ; FALLBACK16-NEXT: shll $3, %eax ; FALLBACK16-NEXT: andl $24, %eax ; FALLBACK16-NEXT: movl %edx, %edi ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: movl 72(%esp,%esi), %ecx ; FALLBACK16-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK16-NEXT: movb %al, %ch ; FALLBACK16-NEXT: notb %ch ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: orl %edi, %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 64(%esp,%esi), %edi ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: addl %edx, %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: orl %edi, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 76(%esp,%esi), %edx ; FALLBACK16-NEXT: movl %edx, %ebp ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shrl %cl, %ebp ; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi ; FALLBACK16-NEXT: leal (%edi,%edi), %ebx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: orl %ebp, %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: addl %edx, %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: orl %ebx, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 84(%esp,%esi), %ebx ; FALLBACK16-NEXT: movl %ebx, %ebp ; FALLBACK16-NEXT: movl %eax, %edx ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %ebp ; FALLBACK16-NEXT: movl 88(%esp,%esi), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: addl %eax, %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: orl %ebp, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: addl %ebx, %ebx ; FALLBACK16-NEXT: 
movb %ch, %cl ; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: orl %edi, %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 92(%esp,%esi), %ebx ; FALLBACK16-NEXT: movl %ebx, %ebp ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %ebp ; FALLBACK16-NEXT: movl 96(%esp,%esi), %edi ; FALLBACK16-NEXT: leal (%edi,%edi), %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: orl %ebp, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: addl %ebx, %ebx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: orl %eax, %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 100(%esp,%esi), %ebx ; FALLBACK16-NEXT: movl %ebx, %ebp ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %ebp ; FALLBACK16-NEXT: movl 104(%esp,%esi), %edx ; FALLBACK16-NEXT: leal (%edx,%edx), %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: orl %ebp, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: addl %ebx, %ebx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: orl %edi, %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 108(%esp,%esi), %edi ; FALLBACK16-NEXT: movl %edi, %ebp ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shrl %cl, %ebp ; FALLBACK16-NEXT: movl 112(%esp,%esi), %ecx ; FALLBACK16-NEXT: 
movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: orl %ebp, %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: addl %edi, %edi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: orl %edx, %edi ; FALLBACK16-NEXT: movl %esi, %edx ; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 116(%esp,%esi), %esi ; FALLBACK16-NEXT: movl %esi, %ebx ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: movl 120(%esp,%edx), %eax ; FALLBACK16-NEXT: leal (%eax,%eax), %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %ebx, %ebp ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: addl %esi, %esi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: orl %ebx, %esi ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx ; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: orl %eax, %edx ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl %ebx, 60(%eax) ; FALLBACK16-NEXT: movl %edx, 56(%eax) ; FALLBACK16-NEXT: movl %esi, 48(%eax) ; FALLBACK16-NEXT: movl 
%ebp, 52(%eax) ; FALLBACK16-NEXT: movl %edi, 40(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 44(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 32(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 36(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 24(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 28(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 16(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 20(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 8(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 12(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, (%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 4(%eax) ; FALLBACK16-NEXT: addl $204, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx ; FALLBACK16-NEXT: popl %ebp ; FALLBACK16-NEXT: retl ; ; FALLBACK17-LABEL: lshr_64bytes: ; FALLBACK17: # %bb.0: ; FALLBACK17-NEXT: pushl %ebp ; FALLBACK17-NEXT: pushl %ebx ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi ; FALLBACK17-NEXT: subl $188, %esp ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl (%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 4(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 8(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 12(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 16(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 20(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 24(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 28(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 32(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 36(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: movl 40(%ecx), %ebp ; FALLBACK17-NEXT: movl 44(%ecx), %ebx ; FALLBACK17-NEXT: movl 48(%ecx), %edi ; FALLBACK17-NEXT: movl 52(%ecx), %esi ; FALLBACK17-NEXT: movl 56(%ecx), %edx ; FALLBACK17-NEXT: movl 60(%ecx), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl (%ecx), %ecx ; FALLBACK17-NEXT: xorps %xmm0, %xmm0 ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: 
movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ecx, %ebp ; FALLBACK17-NEXT: andl $60, %ebp ; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shll $3, %ecx ; FALLBACK17-NEXT: andl $24, %ecx ; FALLBACK17-NEXT: shrdl %cl, %edx, %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %esi ; FALLBACK17-NEXT: shrdl %cl, %edi, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %edx ; FALLBACK17-NEXT: shrdl %cl, %esi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, 
%edx ; FALLBACK17-NEXT: shrdl %cl, %edi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi ; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %edx ; FALLBACK17-NEXT: shrdl %cl, %esi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl %esi, %edx ; FALLBACK17-NEXT: shrdl %cl, %eax, %edi ; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %edi ; FALLBACK17-NEXT: shrdl %cl, %esi, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx ; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %edi ; FALLBACK17-NEXT: shrdl %cl, %edx, %edi ; FALLBACK17-NEXT: shrdl %cl, %eax, %esi ; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx ; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK17-NEXT: movl %edx, 56(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx ; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK17-NEXT: shrl %cl, %eax ; FALLBACK17-NEXT: movl %eax, 60(%ebp) ; FALLBACK17-NEXT: movl %esi, 48(%ebp) ; FALLBACK17-NEXT: movl %edi, 52(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 40(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 44(%ebp) ; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 32(%ebp) ; 
FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 36(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 24(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 28(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 16(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 20(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 8(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 12(%ebp) ; FALLBACK17-NEXT: movl %ebx, (%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 4(%ebp) ; FALLBACK17-NEXT: addl $188, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi ; FALLBACK17-NEXT: popl %ebx ; FALLBACK17-NEXT: popl %ebp ; FALLBACK17-NEXT: retl ; ; FALLBACK18-LABEL: lshr_64bytes: ; FALLBACK18: # %bb.0: ; FALLBACK18-NEXT: pushl %ebp ; FALLBACK18-NEXT: pushl %ebx ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $204, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl (%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 4(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 8(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 12(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 16(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 20(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
; FALLBACK18-NEXT: movl 24(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 28(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 32(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 36(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 40(%eax), %ebp ; FALLBACK18-NEXT: movl 44(%eax), %ebx ; FALLBACK18-NEXT: movl 48(%eax), %edi ; FALLBACK18-NEXT: movl 52(%eax), %esi ; FALLBACK18-NEXT: movl 56(%eax), %edx ; FALLBACK18-NEXT: movl 60(%eax), %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl (%eax), %eax ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; 
FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: leal (,%eax,8), %edx ; FALLBACK18-NEXT: andl $24, %edx ; FALLBACK18-NEXT: andl $60, %ecx ; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, %esi, %edi ; FALLBACK18-NEXT: movl %edx, %ebx ; FALLBACK18-NEXT: notb %bl ; FALLBACK18-NEXT: leal (%eax,%eax), %ebp ; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi ; FALLBACK18-NEXT: addl %esi, %esi ; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi ; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
FALLBACK18-NEXT: leal (%eax,%eax), %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi ; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi ; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi ; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %esi ; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi ; FALLBACK18-NEXT: movl %ecx, %edi ; FALLBACK18-NEXT: movl 
%ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %esi, %esi ; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK18-NEXT: orl %ecx, %esi ; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp ; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx ; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx ; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax ; FALLBACK18-NEXT: shrxl %edx, %eax, %edi ; FALLBACK18-NEXT: orl %edi, %ecx ; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: addl %eax, %eax ; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi ; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp ; FALLBACK18-NEXT: shrxl %edx, %ebp, %edx ; FALLBACK18-NEXT: addl %ebp, %ebp ; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx ; FALLBACK18-NEXT: orl %eax, %ebx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl %edx, 60(%eax) ; FALLBACK18-NEXT: movl %ebx, 56(%eax) ; FALLBACK18-NEXT: movl %edi, 48(%eax) ; FALLBACK18-NEXT: movl %ecx, 52(%eax) ; FALLBACK18-NEXT: movl %esi, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 32(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 36(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 24(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 28(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 16(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 20(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 8(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 12(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, (%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 4(%eax) ; FALLBACK18-NEXT: addl $204, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi ; FALLBACK18-NEXT: popl %ebx ; FALLBACK18-NEXT: popl %ebp ; FALLBACK18-NEXT: retl ; ; FALLBACK19-LABEL: lshr_64bytes: ; FALLBACK19: # %bb.0: ; FALLBACK19-NEXT: pushl %ebp ; FALLBACK19-NEXT: pushl %ebx ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $188, %esp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK19-NEXT: movl (%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 4(%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 8(%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 12(%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 16(%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 20(%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 24(%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 28(%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill ; FALLBACK19-NEXT: movl 32(%ecx), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 36(%ecx), %eax ; FALLBACK19-NEXT: movl %eax, (%esp) # 4-byte Spill ; FALLBACK19-NEXT: movl 40(%ecx), %ebp ; FALLBACK19-NEXT: movl 44(%ecx), %ebx ; FALLBACK19-NEXT: movl 48(%ecx), %edi ; FALLBACK19-NEXT: movl 52(%ecx), %esi ; FALLBACK19-NEXT: movl 56(%ecx), %edx ; FALLBACK19-NEXT: movl 60(%ecx), %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK19-NEXT: movl (%ecx), %ecx ; FALLBACK19-NEXT: xorps %xmm0, %xmm0 ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ecx, %ebp ; FALLBACK19-NEXT: andl $60, %ebp ; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shll $3, %ecx ; FALLBACK19-NEXT: andl $24, %ecx ; FALLBACK19-NEXT: shrdl %cl, %edx, %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %esi ; FALLBACK19-NEXT: shrdl %cl, %edi, %esi ; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %esi, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %edi ; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %edi, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %esi ; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx ; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %edi ; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill ; 
FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %esi, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi ; FALLBACK19-NEXT: movl %edi, %edx ; FALLBACK19-NEXT: shrdl %cl, %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %edi, %esi ; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi ; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp ; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK19-NEXT: movl %eax, 56(%ebp) ; FALLBACK19-NEXT: movl %esi, 48(%ebp) ; FALLBACK19-NEXT: movl %edx, 52(%ebp) ; FALLBACK19-NEXT: movl %ebx, 40(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 44(%ebp) ; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 32(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 36(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 24(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 28(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 16(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 20(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 8(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 12(%ebp) ; FALLBACK19-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx ; 
FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: shrdl %cl, %edx, %edi ; FALLBACK19-NEXT: movl %edi, (%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 4(%ebp) ; FALLBACK19-NEXT: movl %eax, 60(%ebp) ; FALLBACK19-NEXT: addl $188, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi ; FALLBACK19-NEXT: popl %ebx ; FALLBACK19-NEXT: popl %ebp ; FALLBACK19-NEXT: retl ; ; FALLBACK20-LABEL: lshr_64bytes: ; FALLBACK20: # %bb.0: ; FALLBACK20-NEXT: pushl %ebp ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; FALLBACK20-NEXT: pushl %esi ; FALLBACK20-NEXT: subl $204, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 ; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK20-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK20-NEXT: movups 48(%ecx), %xmm3 ; FALLBACK20-NEXT: movl (%eax), %eax ; FALLBACK20-NEXT: xorps %xmm4, %xmm4 ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %eax, %esi ; FALLBACK20-NEXT: andl $60, %esi ; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx ; FALLBACK20-NEXT: shll $3, %eax ; FALLBACK20-NEXT: andl $24, %eax ; FALLBACK20-NEXT: movl %edx, %edi ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK20-NEXT: movb %al, %ch ; FALLBACK20-NEXT: notb %ch ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; 
FALLBACK20-NEXT: orl %edi, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: addl %edx, %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: orl %edi, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx ; FALLBACK20-NEXT: movl %edx, %ebp ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi ; FALLBACK20-NEXT: leal (%edi,%edi), %ebx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %ebp, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: addl %edx, %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: orl %ebx, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx ; FALLBACK20-NEXT: movl %ebx, %ebp ; FALLBACK20-NEXT: movl %eax, %edx ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: addl %eax, %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax ; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: addl %ebx, %ebx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %edi, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill ; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx ; FALLBACK20-NEXT: movl %ebx, %ebp ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi ; FALLBACK20-NEXT: leal (%edi,%edi), %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax ; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: addl %ebx, %ebx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %eax, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx ; FALLBACK20-NEXT: movl %ebx, %ebp ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx ; FALLBACK20-NEXT: leal (%edx,%edx), %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax ; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: addl %ebx, %ebx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %edi, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi ; FALLBACK20-NEXT: movl %edi, %ebp ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK20-NEXT: movb %ch, %cl ; 
FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %ebp, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shrl %cl, %edx ; FALLBACK20-NEXT: addl %edi, %edi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edi ; FALLBACK20-NEXT: orl %edx, %edi ; FALLBACK20-NEXT: movl %esi, %edx ; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi ; FALLBACK20-NEXT: movl %esi, %ebx ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax ; FALLBACK20-NEXT: leal (%eax,%eax), %ebp ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebp ; FALLBACK20-NEXT: orl %ebx, %ebp ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: orl %ebx, %esi ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx ; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: orl %eax, %edx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl %ebx, 60(%eax) ; FALLBACK20-NEXT: movl %edx, 56(%eax) ; FALLBACK20-NEXT: movl %esi, 48(%eax) ; FALLBACK20-NEXT: movl %ebp, 52(%eax) ; FALLBACK20-NEXT: movl %edi, 40(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 44(%eax) ; FALLBACK20-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 32(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 36(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 24(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 28(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 16(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 20(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 8(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 12(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, (%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 4(%eax) ; FALLBACK20-NEXT: addl $204, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx ; FALLBACK20-NEXT: popl %ebp ; FALLBACK20-NEXT: retl ; ; FALLBACK21-LABEL: lshr_64bytes: ; FALLBACK21: # %bb.0: ; FALLBACK21-NEXT: pushl %ebp ; FALLBACK21-NEXT: pushl %ebx ; FALLBACK21-NEXT: pushl %edi ; FALLBACK21-NEXT: pushl %esi ; FALLBACK21-NEXT: subl $188, %esp ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK21-NEXT: movups (%ecx), %xmm0 ; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK21-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK21-NEXT: movups 48(%ecx), %xmm3 ; FALLBACK21-NEXT: movl (%eax), %ecx ; FALLBACK21-NEXT: xorps %xmm4, %xmm4 ; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; 
FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %ecx, %ebp ; FALLBACK21-NEXT: andl $60, %ebp ; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shll $3, %ecx ; FALLBACK21-NEXT: andl $24, %ecx ; FALLBACK21-NEXT: shrdl %cl, %edx, %eax ; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %esi ; FALLBACK21-NEXT: shrdl %cl, %edi, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %edi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi ; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl %esi, %edx ; FALLBACK21-NEXT: shrdl %cl, %eax, %edi ; FALLBACK21-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edi ; FALLBACK21-NEXT: shrdl %cl, %esi, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx ; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edi ; FALLBACK21-NEXT: shrdl %cl, %edx, %edi ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi ; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx ; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK21-NEXT: movl %edx, 56(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx ; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK21-NEXT: shrl %cl, %eax ; FALLBACK21-NEXT: movl %eax, 60(%ebp) ; FALLBACK21-NEXT: movl %esi, 48(%ebp) ; FALLBACK21-NEXT: movl %edi, 52(%ebp) ; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 40(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 44(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 32(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 36(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 24(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 28(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 16(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 20(%ebp) ; FALLBACK21-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 8(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 12(%ebp) ; FALLBACK21-NEXT: movl %ebx, (%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 4(%ebp) ; FALLBACK21-NEXT: addl $188, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi ; FALLBACK21-NEXT: popl %ebx ; FALLBACK21-NEXT: popl %ebp ; FALLBACK21-NEXT: retl ; ; FALLBACK22-LABEL: lshr_64bytes: ; FALLBACK22: # %bb.0: ; FALLBACK22-NEXT: pushl %ebp ; FALLBACK22-NEXT: pushl %ebx ; FALLBACK22-NEXT: pushl %edi ; FALLBACK22-NEXT: pushl %esi ; FALLBACK22-NEXT: subl $204, %esp ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 ; FALLBACK22-NEXT: movl (%eax), %ecx ; FALLBACK22-NEXT: xorps %xmm4, %xmm4 ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: leal (,%ecx,8), %edx ; FALLBACK22-NEXT: andl $24, %edx ; FALLBACK22-NEXT: andl $60, %ecx ; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, %esi, %edi ; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: notb %bl ; FALLBACK22-NEXT: leal (%eax,%eax), %ebp ; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK22-NEXT: orl %edi, %ebp ; FALLBACK22-NEXT: movl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi ; FALLBACK22-NEXT: addl %esi, %esi ; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK22-NEXT: orl %edi, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi ; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi ; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi ; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: 
movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi ; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl %ecx, %eax ; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi ; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx ; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %esi, %esi ; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK22-NEXT: orl %ecx, %esi ; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp ; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx ; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax ; FALLBACK22-NEXT: shrxl %edx, %eax, %edi ; FALLBACK22-NEXT: orl %edi, %ecx ; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: addl %eax, %eax ; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi ; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax ; FALLBACK22-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp ; FALLBACK22-NEXT: shrxl %edx, %ebp, %edx ; FALLBACK22-NEXT: addl %ebp, %ebp ; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx ; FALLBACK22-NEXT: orl %eax, %ebx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl %edx, 60(%eax) ; FALLBACK22-NEXT: movl %ebx, 56(%eax) ; FALLBACK22-NEXT: movl %edi, 48(%eax) ; FALLBACK22-NEXT: movl %ecx, 52(%eax) ; FALLBACK22-NEXT: movl %esi, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 32(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 36(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 24(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 28(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 16(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 20(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 8(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 12(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, (%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 4(%eax) ; FALLBACK22-NEXT: addl $204, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx ; FALLBACK22-NEXT: popl %ebp ; FALLBACK22-NEXT: retl ; ; FALLBACK23-LABEL: lshr_64bytes: ; FALLBACK23: # %bb.0: ; FALLBACK23-NEXT: pushl %ebp ; FALLBACK23-NEXT: pushl %ebx ; 
FALLBACK23-NEXT: pushl %edi ; FALLBACK23-NEXT: pushl %esi ; FALLBACK23-NEXT: subl $188, %esp ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK23-NEXT: movups (%ecx), %xmm0 ; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK23-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK23-NEXT: movups 48(%ecx), %xmm3 ; FALLBACK23-NEXT: movl (%eax), %ecx ; FALLBACK23-NEXT: xorps %xmm4, %xmm4 ; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %ecx, %ebp ; FALLBACK23-NEXT: andl $60, %ebp ; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shll $3, %ecx ; FALLBACK23-NEXT: andl $24, %ecx ; FALLBACK23-NEXT: shrdl %cl, %edx, %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %esi ; FALLBACK23-NEXT: shrdl %cl, %edi, %esi ; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %esi, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK23-NEXT: movl 
76(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %edi, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %esi ; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx ; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %esi, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi ; FALLBACK23-NEXT: movl %edi, %edx ; FALLBACK23-NEXT: shrdl %cl, %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %edi, %esi ; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi ; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp ; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK23-NEXT: movl %eax, 56(%ebp) ; FALLBACK23-NEXT: movl %esi, 48(%ebp) ; FALLBACK23-NEXT: movl %edx, 52(%ebp) ; FALLBACK23-NEXT: movl %ebx, 40(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 44(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 32(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 36(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 24(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload 
; FALLBACK23-NEXT: movl %eax, 28(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 16(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 20(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 8(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 12(%ebp) ; FALLBACK23-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK23-NEXT: shrdl %cl, %edx, %edi ; FALLBACK23-NEXT: movl %edi, (%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 4(%ebp) ; FALLBACK23-NEXT: movl %eax, 60(%ebp) ; FALLBACK23-NEXT: addl $188, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi ; FALLBACK23-NEXT: popl %ebx ; FALLBACK23-NEXT: popl %ebp ; FALLBACK23-NEXT: retl ; ; FALLBACK24-LABEL: lshr_64bytes: ; FALLBACK24: # %bb.0: ; FALLBACK24-NEXT: pushl %ebp ; FALLBACK24-NEXT: pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi ; FALLBACK24-NEXT: subl $204, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1 ; FALLBACK24-NEXT: movl (%eax), %ecx ; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, %esi ; FALLBACK24-NEXT: andl $60, %esi ; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx ; FALLBACK24-NEXT: shll $3, %ecx ; FALLBACK24-NEXT: andl $24, %ecx ; FALLBACK24-NEXT: movl %edx, %edi ; FALLBACK24-NEXT: shrl %cl, 
%edi ; FALLBACK24-NEXT: movl 72(%esp,%esi), %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: leal (%eax,%eax), %ebx ; FALLBACK24-NEXT: movl %ecx, %ebp ; FALLBACK24-NEXT: movb %cl, %ch ; FALLBACK24-NEXT: notb %ch ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %edi, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi ; FALLBACK24-NEXT: movl %ebp, %eax ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: addl %edx, %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: orl %edi, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx ; FALLBACK24-NEXT: movl %edx, %ebp ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi ; FALLBACK24-NEXT: leal (%edi,%edi), %ebx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %ebp, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: addl %edx, %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: orl %ebx, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx ; FALLBACK24-NEXT: movl %ebx, %ebp ; FALLBACK24-NEXT: movl %eax, %edx ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: addl %eax, %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax ; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: addl %ebx, %ebx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %edi, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx ; FALLBACK24-NEXT: movl %ebx, %ebp ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi ; FALLBACK24-NEXT: leal (%edi,%edi), %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax ; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: addl %ebx, %ebx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %eax, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx ; FALLBACK24-NEXT: movl %ebx, %ebp ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx ; FALLBACK24-NEXT: leal (%edx,%edx), %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax ; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: addl %ebx, %ebx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %edi, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi ; 
FALLBACK24-NEXT: movl %edi, %ebp ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %ebp, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shrl %cl, %edx ; FALLBACK24-NEXT: addl %edi, %edi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: orl %edx, %edi ; FALLBACK24-NEXT: movl %esi, %edx ; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi ; FALLBACK24-NEXT: movl %esi, %ebx ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax ; FALLBACK24-NEXT: leal (%eax,%eax), %ebp ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebp ; FALLBACK24-NEXT: orl %ebx, %ebp ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: orl %ebx, %esi ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx ; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: orl %eax, %edx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: 
movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl %ebx, 60(%eax) ; FALLBACK24-NEXT: movl %edx, 56(%eax) ; FALLBACK24-NEXT: movl %esi, 48(%eax) ; FALLBACK24-NEXT: movl %ebp, 52(%eax) ; FALLBACK24-NEXT: movl %edi, 40(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 44(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 32(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 36(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 24(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 28(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 16(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 20(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 8(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 12(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, (%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 4(%eax) ; FALLBACK24-NEXT: addl $204, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx ; FALLBACK24-NEXT: popl %ebp ; FALLBACK24-NEXT: vzeroupper ; FALLBACK24-NEXT: retl ; ; FALLBACK25-LABEL: lshr_64bytes: ; FALLBACK25: # %bb.0: ; FALLBACK25-NEXT: pushl %ebp ; FALLBACK25-NEXT: pushl %ebx ; FALLBACK25-NEXT: pushl %edi ; FALLBACK25-NEXT: pushl %esi ; FALLBACK25-NEXT: subl $188, %esp ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 ; 
FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1 ; FALLBACK25-NEXT: movl (%eax), %ecx ; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %ecx, %ebp ; FALLBACK25-NEXT: andl $60, %ebp ; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shll $3, %ecx ; FALLBACK25-NEXT: andl $24, %ecx ; FALLBACK25-NEXT: shrdl %cl, %edx, %eax ; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %esi ; FALLBACK25-NEXT: shrdl %cl, %edi, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %edi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi ; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill ; FALLBACK25-NEXT: movl %esi, %edx ; FALLBACK25-NEXT: shrdl %cl, %eax, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edi ; FALLBACK25-NEXT: shrdl %cl, %esi, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx ; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edi ; FALLBACK25-NEXT: shrdl %cl, %edx, %edi ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi ; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx ; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK25-NEXT: movl %edx, 56(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx ; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK25-NEXT: shrl %cl, %eax ; FALLBACK25-NEXT: movl %eax, 60(%ebp) ; FALLBACK25-NEXT: movl %esi, 48(%ebp) ; FALLBACK25-NEXT: movl %edi, 52(%ebp) ; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 40(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 44(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 32(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 36(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 24(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 28(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 16(%ebp) ; FALLBACK25-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 20(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 8(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 12(%ebp) ; FALLBACK25-NEXT: movl %ebx, (%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 4(%ebp) ; FALLBACK25-NEXT: addl $188, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi ; FALLBACK25-NEXT: popl %ebx ; FALLBACK25-NEXT: popl %ebp ; FALLBACK25-NEXT: vzeroupper ; FALLBACK25-NEXT: retl ; ; FALLBACK26-LABEL: lshr_64bytes: ; FALLBACK26: # %bb.0: ; FALLBACK26-NEXT: pushl %ebp ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi ; FALLBACK26-NEXT: subl $204, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1 ; FALLBACK26-NEXT: movl (%eax), %ecx ; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: leal (,%ecx,8), %edx ; FALLBACK26-NEXT: andl $24, %edx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, %esi, %edi ; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: notb %bl ; FALLBACK26-NEXT: leal (%eax,%eax), %ebp ; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK26-NEXT: orl %edi, %ebp ; FALLBACK26-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi ; FALLBACK26-NEXT: addl %esi, %esi ; FALLBACK26-NEXT: 
shlxl %ebx, %esi, %esi ; FALLBACK26-NEXT: orl %edi, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi ; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi ; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi ; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi ; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi ; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi ; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi ; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK26-NEXT: orl %eax, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 120(%esp,%ecx), %ebp ; FALLBACK26-NEXT: leal (%ebp,%ebp), %eax ; FALLBACK26-NEXT: shlxl %ebx, %eax, %esi ; FALLBACK26-NEXT: movl 116(%esp,%ecx), %eax ; FALLBACK26-NEXT: shrxl %edx, %eax, %edi ; FALLBACK26-NEXT: orl %edi, %esi ; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: addl %eax, %eax ; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi ; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax ; FALLBACK26-NEXT: movl 124(%esp,%ecx), %ecx ; FALLBACK26-NEXT: shrxl %edx, %ecx, %edx ; FALLBACK26-NEXT: addl %ecx, %ecx ; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ebx ; FALLBACK26-NEXT: orl %eax, %ebx ; 
FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: movl %edx, 60(%ecx) ; FALLBACK26-NEXT: movl %ebx, 56(%ecx) ; FALLBACK26-NEXT: movl %edi, 48(%ecx) ; FALLBACK26-NEXT: movl %esi, 52(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 40(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 44(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 32(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 36(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 24(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 28(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 16(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 20(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 8(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 12(%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, (%ecx) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: movl %eax, 4(%ecx) ; FALLBACK26-NEXT: addl $204, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx ; FALLBACK26-NEXT: popl %ebp ; FALLBACK26-NEXT: vzeroupper ; FALLBACK26-NEXT: retl ; ; FALLBACK27-LABEL: lshr_64bytes: ; FALLBACK27: # %bb.0: ; FALLBACK27-NEXT: pushl %ebp ; FALLBACK27-NEXT: pushl %ebx ; FALLBACK27-NEXT: pushl %edi ; FALLBACK27-NEXT: pushl %esi ; FALLBACK27-NEXT: subl $188, %esp ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1 ; FALLBACK27-NEXT: movl (%eax), %ecx ; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %ecx, %ebp ; FALLBACK27-NEXT: andl $60, %ebp ; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shll $3, %ecx ; FALLBACK27-NEXT: andl $24, %ecx ; FALLBACK27-NEXT: shrdl %cl, %edx, %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %esi ; FALLBACK27-NEXT: shrdl %cl, %edi, %esi ; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %esi, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %edi, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %esi ; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx ; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %edx ; FALLBACK27-NEXT: 
shrdl %cl, %ebx, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %esi, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi ; FALLBACK27-NEXT: movl %edi, %edx ; FALLBACK27-NEXT: shrdl %cl, %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %edi, %esi ; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi ; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp ; FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK27-NEXT: movl %eax, 56(%ebp) ; FALLBACK27-NEXT: movl %esi, 48(%ebp) ; FALLBACK27-NEXT: movl %edx, 52(%ebp) ; FALLBACK27-NEXT: movl %ebx, 40(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 44(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 32(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 36(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 24(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 28(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 16(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 20(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 8(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 12(%ebp) ; FALLBACK27-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK27-NEXT: shrdl %cl, %edx, %edi ; FALLBACK27-NEXT: movl %edi, (%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 4(%ebp) ; FALLBACK27-NEXT: movl %eax, 60(%ebp) ; FALLBACK27-NEXT: addl $188, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi ; FALLBACK27-NEXT: popl %ebx ; FALLBACK27-NEXT: popl %ebp ; FALLBACK27-NEXT: vzeroupper ; FALLBACK27-NEXT: retl ; ; FALLBACK28-LABEL: lshr_64bytes: ; FALLBACK28: # %bb.0: ; FALLBACK28-NEXT: pushl %ebp ; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi ; FALLBACK28-NEXT: subl $204, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %zmm0 ; FALLBACK28-NEXT: movl (%eax), %ecx ; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, %esi ; FALLBACK28-NEXT: andl $60, %esi ; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx ; FALLBACK28-NEXT: shll $3, %ecx ; FALLBACK28-NEXT: andl $24, %ecx ; FALLBACK28-NEXT: movl %edx, %edi ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: movl 72(%esp,%esi), %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: leal (%eax,%eax), %ebx ; FALLBACK28-NEXT: movl %ecx, %ebp ; FALLBACK28-NEXT: movb %cl, %ch ; FALLBACK28-NEXT: notb %ch ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %edi, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi ; FALLBACK28-NEXT: movl %ebp, %eax ; FALLBACK28-NEXT: movb 
%al, %cl ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: addl %edx, %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: orl %edi, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx ; FALLBACK28-NEXT: movl %edx, %ebp ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi ; FALLBACK28-NEXT: leal (%edi,%edi), %ebx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %ebp, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: addl %edx, %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: orl %ebx, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx ; FALLBACK28-NEXT: movl %ebx, %ebp ; FALLBACK28-NEXT: movl %eax, %edx ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: addl %eax, %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax ; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: addl %ebx, %ebx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %edi, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx ; FALLBACK28-NEXT: movl %ebx, %ebp ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp ; 
FALLBACK28-NEXT: movl 96(%esp,%esi), %edi ; FALLBACK28-NEXT: leal (%edi,%edi), %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax ; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: addl %ebx, %ebx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %eax, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx ; FALLBACK28-NEXT: movl %ebx, %ebp ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx ; FALLBACK28-NEXT: leal (%edx,%edx), %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax ; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: addl %ebx, %ebx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %edi, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi ; FALLBACK28-NEXT: movl %edi, %ebp ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %ebp, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; 
FALLBACK28-NEXT: shrl %cl, %edx ; FALLBACK28-NEXT: addl %edi, %edi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edi ; FALLBACK28-NEXT: orl %edx, %edi ; FALLBACK28-NEXT: movl %esi, %edx ; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi ; FALLBACK28-NEXT: movl %esi, %ebx ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax ; FALLBACK28-NEXT: leal (%eax,%eax), %ebp ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebp ; FALLBACK28-NEXT: orl %ebx, %ebp ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: orl %ebx, %esi ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx ; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: orl %eax, %edx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl %ebx, 60(%eax) ; FALLBACK28-NEXT: movl %edx, 56(%eax) ; FALLBACK28-NEXT: movl %esi, 48(%eax) ; FALLBACK28-NEXT: movl %ebp, 52(%eax) ; FALLBACK28-NEXT: movl %edi, 40(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 44(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 32(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl 
%ecx, 36(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 24(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 28(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 16(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 20(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 8(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 12(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, (%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 4(%eax) ; FALLBACK28-NEXT: addl $204, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx ; FALLBACK28-NEXT: popl %ebp ; FALLBACK28-NEXT: vzeroupper ; FALLBACK28-NEXT: retl ; ; FALLBACK29-LABEL: lshr_64bytes: ; FALLBACK29: # %bb.0: ; FALLBACK29-NEXT: pushl %ebp ; FALLBACK29-NEXT: pushl %ebx ; FALLBACK29-NEXT: pushl %edi ; FALLBACK29-NEXT: pushl %esi ; FALLBACK29-NEXT: subl $188, %esp ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK29-NEXT: vmovups (%ecx), %zmm0 ; FALLBACK29-NEXT: movl (%eax), %ecx ; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %ecx, %ebp ; FALLBACK29-NEXT: andl $60, %ebp ; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shll $3, %ecx ; FALLBACK29-NEXT: andl $24, %ecx ; FALLBACK29-NEXT: shrdl %cl, %edx, %eax ; FALLBACK29-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %esi ; FALLBACK29-NEXT: shrdl %cl, %edi, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %edi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi ; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl %esi, %edx ; FALLBACK29-NEXT: shrdl %cl, %eax, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edi ; FALLBACK29-NEXT: shrdl %cl, %esi, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx ; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edi ; FALLBACK29-NEXT: shrdl %cl, %edx, %edi ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi ; 
FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx ; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK29-NEXT: movl %edx, 56(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx ; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK29-NEXT: shrl %cl, %eax ; FALLBACK29-NEXT: movl %eax, 60(%ebp) ; FALLBACK29-NEXT: movl %esi, 48(%ebp) ; FALLBACK29-NEXT: movl %edi, 52(%ebp) ; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 40(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 44(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 32(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 36(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 24(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 28(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 16(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 20(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 8(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 12(%ebp) ; FALLBACK29-NEXT: movl %ebx, (%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 4(%ebp) ; FALLBACK29-NEXT: addl $188, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi ; FALLBACK29-NEXT: popl %ebx ; FALLBACK29-NEXT: popl %ebp ; FALLBACK29-NEXT: vzeroupper ; FALLBACK29-NEXT: retl ; ; FALLBACK30-LABEL: 
lshr_64bytes: ; FALLBACK30: # %bb.0: ; FALLBACK30-NEXT: pushl %ebp ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi ; FALLBACK30-NEXT: subl $204, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 ; FALLBACK30-NEXT: movl (%eax), %edx ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: leal (,%edx,8), %ecx ; FALLBACK30-NEXT: andl $24, %ecx ; FALLBACK30-NEXT: andl $60, %edx ; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi ; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi ; FALLBACK30-NEXT: movl %ecx, %ebx ; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: leal (%eax,%eax), %ebp ; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp ; FALLBACK30-NEXT: orl %edi, %ebp ; FALLBACK30-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %ecx, 64(%esp,%edx), %edi ; FALLBACK30-NEXT: addl %esi, %esi ; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK30-NEXT: orl %edi, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 80(%esp,%edx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: movl 76(%esp,%edx), %edi ; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 88(%esp,%edx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: movl 84(%esp,%edx), %edi ; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 96(%esp,%edx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: movl 92(%esp,%edx), %edi ; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 104(%esp,%edx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: movl 100(%esp,%edx), %edi ; FALLBACK30-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 112(%esp,%edx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi ; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK30-NEXT: movl 108(%esp,%edx), %esi ; FALLBACK30-NEXT: shrxl %ecx, %esi, %ebp ; 
FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi ; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK30-NEXT: orl %eax, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 120(%esp,%edx), %ebp ; FALLBACK30-NEXT: leal (%ebp,%ebp), %eax ; FALLBACK30-NEXT: shlxl %ebx, %eax, %esi ; FALLBACK30-NEXT: movl 116(%esp,%edx), %eax ; FALLBACK30-NEXT: shrxl %ecx, %eax, %edi ; FALLBACK30-NEXT: orl %edi, %esi ; FALLBACK30-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: addl %eax, %eax ; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi ; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK30-NEXT: shrxl %ecx, %ebp, %eax ; FALLBACK30-NEXT: movl 124(%esp,%edx), %edx ; FALLBACK30-NEXT: shrxl %ecx, %edx, %ebp ; FALLBACK30-NEXT: leal (%edx,%edx), %ecx ; FALLBACK30-NEXT: shlxl %ebx, %ecx, %edx ; FALLBACK30-NEXT: orl %eax, %edx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: movl %ebp, 60(%ecx) ; FALLBACK30-NEXT: movl %edx, 56(%ecx) ; FALLBACK30-NEXT: movl %edi, 48(%ecx) ; FALLBACK30-NEXT: movl %esi, 52(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 40(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 44(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 32(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 36(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 24(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; 
FALLBACK30-NEXT: movl %eax, 28(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 16(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 20(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 8(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 12(%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, (%ecx) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: movl %eax, 4(%ecx) ; FALLBACK30-NEXT: addl $204, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx ; FALLBACK30-NEXT: popl %ebp ; FALLBACK30-NEXT: vzeroupper ; FALLBACK30-NEXT: retl ; ; FALLBACK31-LABEL: lshr_64bytes: ; FALLBACK31: # %bb.0: ; FALLBACK31-NEXT: pushl %ebp ; FALLBACK31-NEXT: pushl %ebx ; FALLBACK31-NEXT: pushl %edi ; FALLBACK31-NEXT: pushl %esi ; FALLBACK31-NEXT: subl $188, %esp ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK31-NEXT: vmovups (%ecx), %zmm0 ; FALLBACK31-NEXT: movl (%eax), %ecx ; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %ecx, %ebp ; FALLBACK31-NEXT: andl $60, %ebp ; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shll $3, %ecx ; FALLBACK31-NEXT: andl $24, %ecx ; FALLBACK31-NEXT: shrdl %cl, %edx, %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %esi ; FALLBACK31-NEXT: shrdl %cl, %edi, 
%esi ; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %esi, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %edi, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %esi ; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx ; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %esi, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi ; FALLBACK31-NEXT: movl %edi, %edx ; FALLBACK31-NEXT: shrdl %cl, %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %edi, %esi ; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi ; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp ; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK31-NEXT: movl %eax, 56(%ebp) ; FALLBACK31-NEXT: movl 
%esi, 48(%ebp) ; FALLBACK31-NEXT: movl %edx, 52(%ebp) ; FALLBACK31-NEXT: movl %ebx, 40(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 44(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 32(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 36(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 24(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 28(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 16(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 20(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 8(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 12(%ebp) ; FALLBACK31-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK31-NEXT: shrdl %cl, %edx, %edi ; FALLBACK31-NEXT: movl %edi, (%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 4(%ebp) ; FALLBACK31-NEXT: movl %eax, 60(%ebp) ; FALLBACK31-NEXT: addl $188, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi ; FALLBACK31-NEXT: popl %ebx ; FALLBACK31-NEXT: popl %ebp ; FALLBACK31-NEXT: vzeroupper ; FALLBACK31-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3 %res = lshr i512 %src, %bitOff store i512 %res, ptr %dst, align 1 ret void } define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { ; 
X64-SSE2-LABEL: lshr_64bytes_qwordOff: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rbx ; X64-SSE2-NEXT: movq (%rdi), %rax ; X64-SSE2-NEXT: movq 8(%rdi), %rcx ; X64-SSE2-NEXT: movq 16(%rdi), %r8 ; X64-SSE2-NEXT: movq 24(%rdi), %r9 ; X64-SSE2-NEXT: movq 32(%rdi), %r10 ; X64-SSE2-NEXT: movq 40(%rdi), %r11 ; X64-SSE2-NEXT: movq 48(%rdi), %rbx ; X64-SSE2-NEXT: movq 56(%rdi), %rdi ; X64-SSE2-NEXT: movl (%rsi), %esi ; X64-SSE2-NEXT: xorps %xmm0, %xmm0 ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: andl $7, %esi ; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax ; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx ; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi ; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), %r8 ; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9 ; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10 ; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11 ; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi ; X64-SSE2-NEXT: movq %rsi, 48(%rdx) ; X64-SSE2-NEXT: movq %r11, 56(%rdx) ; X64-SSE2-NEXT: movq %r10, 32(%rdx) ; X64-SSE2-NEXT: movq %r9, 40(%rdx) ; X64-SSE2-NEXT: movq %r8, 16(%rdx) ; X64-SSE2-NEXT: movq %rdi, 24(%rdx) ; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: lshr_64bytes_qwordOff: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: pushq %rax ; X64-SSE42-NEXT: movups (%rdi), %xmm0 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1 ; X64-SSE42-NEXT: movups 32(%rdi), %xmm2 ; X64-SSE42-NEXT: movups 48(%rdi), 
%xmm3 ; X64-SSE42-NEXT: movl (%rsi), %eax ; X64-SSE42-NEXT: xorps %xmm4, %xmm4 ; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: andl $7, %eax ; X64-SSE42-NEXT: movups -128(%rsp,%rax,8), %xmm0 ; X64-SSE42-NEXT: movups -112(%rsp,%rax,8), %xmm1 ; X64-SSE42-NEXT: movups -96(%rsp,%rax,8), %xmm2 ; X64-SSE42-NEXT: movups -80(%rsp,%rax,8), %xmm3 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx) ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: popq %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: lshr_64bytes_qwordOff: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: pushq %rax ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 ; X64-AVX1-NEXT: movl (%rsi), %eax ; X64-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: andl $7, %eax ; X64-AVX1-NEXT: vmovups -128(%rsp,%rax,8), %xmm0 ; X64-AVX1-NEXT: vmovups -112(%rsp,%rax,8), %xmm1 ; X64-AVX1-NEXT: vmovups -96(%rsp,%rax,8), %xmm2 ; X64-AVX1-NEXT: vmovups -80(%rsp,%rax,8), %xmm3 ; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx) ; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) ; X64-AVX1-NEXT: vmovups %xmm0, (%rdx) ; X64-AVX1-NEXT: popq %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; ; X64-AVX512-LABEL: lshr_64bytes_qwordOff: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: pushq %rax ; X64-AVX512-NEXT: vmovups (%rdi), %zmm0 ; X64-AVX512-NEXT: 
movl (%rsi), %eax ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: andl $7, %eax ; X64-AVX512-NEXT: vmovups -128(%rsp,%rax,8), %xmm0 ; X64-AVX512-NEXT: vmovups -112(%rsp,%rax,8), %xmm1 ; X64-AVX512-NEXT: vmovups -96(%rsp,%rax,8), %xmm2 ; X64-AVX512-NEXT: vmovups -80(%rsp,%rax,8), %xmm3 ; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx) ; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) ; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) ; X64-AVX512-NEXT: popq %rax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; ; X86-SSE2-LABEL: lshr_64bytes_qwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $188, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 4(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 8(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 12(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 16(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 20(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 24(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 28(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 32(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 36(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 40(%eax), %ebp ; 
X86-SSE2-NEXT: movl 44(%eax), %ebx ; X86-SSE2-NEXT: movl 48(%eax), %edi ; X86-SSE2-NEXT: movl 52(%eax), %esi ; X86-SSE2-NEXT: movl 56(%eax), %edx ; X86-SSE2-NEXT: movl 60(%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %eax ; X86-SSE2-NEXT: xorps %xmm0, %xmm0 ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andl $7, %eax ; X86-SSE2-NEXT: movl 48(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl 
%ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 52(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 60(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 56(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 68(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 64(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 76(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 72(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 84(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 80(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 92(%esp,%eax,8), %ebp ; X86-SSE2-NEXT: movl 88(%esp,%eax,8), %ebx ; X86-SSE2-NEXT: movl 100(%esp,%eax,8), %edi ; X86-SSE2-NEXT: movl 96(%esp,%eax,8), %esi ; X86-SSE2-NEXT: movl 108(%esp,%eax,8), %edx ; X86-SSE2-NEXT: movl 104(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %ecx, 56(%eax) ; X86-SSE2-NEXT: movl %edx, 60(%eax) ; X86-SSE2-NEXT: movl %esi, 48(%eax) ; X86-SSE2-NEXT: movl %edi, 52(%eax) ; X86-SSE2-NEXT: movl %ebx, 40(%eax) ; X86-SSE2-NEXT: movl %ebp, 44(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl 
%ecx, 28(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 16(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 20(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: addl $188, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: lshr_64bytes_qwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: subl $140, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1 ; X86-SSE42-NEXT: movups 32(%edx), %xmm2 ; X86-SSE42-NEXT: movups 48(%edx), %xmm3 ; X86-SSE42-NEXT: movl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm4, %xmm4 ; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm0, (%esp) ; X86-SSE42-NEXT: andl $7, %ecx ; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0 ; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1 ; X86-SSE42-NEXT: movups 32(%esp,%ecx,8), %xmm2 ; X86-SSE42-NEXT: movups 48(%esp,%ecx,8), %xmm3 ; X86-SSE42-NEXT: movups %xmm3, 48(%eax) ; X86-SSE42-NEXT: movups %xmm2, 32(%eax) ; X86-SSE42-NEXT: movups 
%xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $140, %esp ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: lshr_64bytes_qwordOff: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: subl $140, %esp ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: vmovups (%edx), %ymm0 ; X86-AVX1-NEXT: vmovups 32(%edx), %ymm1 ; X86-AVX1-NEXT: movl (%ecx), %ecx ; X86-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: vmovups %ymm0, (%esp) ; X86-AVX1-NEXT: andl $7, %ecx ; X86-AVX1-NEXT: vmovups (%esp,%ecx,8), %xmm0 ; X86-AVX1-NEXT: vmovups 16(%esp,%ecx,8), %xmm1 ; X86-AVX1-NEXT: vmovups 32(%esp,%ecx,8), %xmm2 ; X86-AVX1-NEXT: vmovups 48(%esp,%ecx,8), %xmm3 ; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax) ; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax) ; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX1-NEXT: vmovups %xmm0, (%eax) ; X86-AVX1-NEXT: addl $140, %esp ; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX512-LABEL: lshr_64bytes_qwordOff: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: subl $140, %esp ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX512-NEXT: vmovups (%edx), %zmm0 ; X86-AVX512-NEXT: movl (%ecx), %ecx ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; X86-AVX512-NEXT: vmovups %zmm0, (%esp) ; X86-AVX512-NEXT: andl $7, %ecx ; X86-AVX512-NEXT: vmovups (%esp,%ecx,8), %xmm0 ; X86-AVX512-NEXT: vmovups 16(%esp,%ecx,8), %xmm1 ; X86-AVX512-NEXT: vmovups 32(%esp,%ecx,8), %xmm2 ; X86-AVX512-NEXT: vmovups 48(%esp,%ecx,8), %xmm3 ; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax) ; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax) ; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX512-NEXT: 
vmovups %xmm0, (%eax) ; X86-AVX512-NEXT: addl $140, %esp ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %qwordOff = load i512, ptr %qwordOff.ptr, align 1 %bitOff = shl i512 %qwordOff, 6 %res = lshr i512 %src, %bitOff store i512 %res, ptr %dst, align 1 ret void } define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: shl_64bytes: ; FALLBACK0: # %bb.0: ; FALLBACK0-NEXT: pushq %r15 ; FALLBACK0-NEXT: pushq %r14 ; FALLBACK0-NEXT: pushq %r13 ; FALLBACK0-NEXT: pushq %r12 ; FALLBACK0-NEXT: pushq %rbx ; FALLBACK0-NEXT: movq (%rdi), %rax ; FALLBACK0-NEXT: movq 8(%rdi), %rcx ; FALLBACK0-NEXT: movq 16(%rdi), %r8 ; FALLBACK0-NEXT: movq 24(%rdi), %r9 ; FALLBACK0-NEXT: movq 32(%rdi), %r10 ; FALLBACK0-NEXT: movq 40(%rdi), %r11 ; FALLBACK0-NEXT: movq 48(%rdi), %rbx ; FALLBACK0-NEXT: movq 56(%rdi), %rdi ; FALLBACK0-NEXT: movl (%rsi), %esi ; FALLBACK0-NEXT: xorps %xmm0, %xmm0 ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: leal (,%rsi,8), %eax ; FALLBACK0-NEXT: andl $56, %eax ; FALLBACK0-NEXT: andl $56, %esi ; FALLBACK0-NEXT: negl %esi ; FALLBACK0-NEXT: movslq %esi, %rbx ; FALLBACK0-NEXT: movq -64(%rsp,%rbx), %r8 ; FALLBACK0-NEXT: movq -56(%rsp,%rbx), %rdi ; FALLBACK0-NEXT: movq %rdi, %r10 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq %r8, %r9 ; 
FALLBACK0-NEXT: shrq %r9 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %r9 ; FALLBACK0-NEXT: orq %r10, %r9 ; FALLBACK0-NEXT: movq -40(%rsp,%rbx), %r10 ; FALLBACK0-NEXT: movq %r10, %r14 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r14 ; FALLBACK0-NEXT: movq -48(%rsp,%rbx), %r15 ; FALLBACK0-NEXT: movq %r15, %r11 ; FALLBACK0-NEXT: shrq %r11 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %r11 ; FALLBACK0-NEXT: orq %r14, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r15 ; FALLBACK0-NEXT: shrq %rdi ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %rdi ; FALLBACK0-NEXT: orq %r15, %rdi ; FALLBACK0-NEXT: movq -24(%rsp,%rbx), %r14 ; FALLBACK0-NEXT: movq %r14, %r12 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r12 ; FALLBACK0-NEXT: movq -32(%rsp,%rbx), %r13 ; FALLBACK0-NEXT: movq %r13, %r15 ; FALLBACK0-NEXT: shrq %r15 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %r15 ; FALLBACK0-NEXT: orq %r12, %r15 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r13 ; FALLBACK0-NEXT: shrq %r10 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %r10 ; FALLBACK0-NEXT: orq %r13, %r10 ; FALLBACK0-NEXT: movq -8(%rsp,%rbx), %r12 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r12 ; FALLBACK0-NEXT: movq -16(%rsp,%rbx), %rbx ; FALLBACK0-NEXT: movq %rbx, %r13 ; FALLBACK0-NEXT: shrq %r13 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %r13 ; FALLBACK0-NEXT: orq %r12, %r13 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %rbx ; FALLBACK0-NEXT: shrq %r14 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shrq %cl, %r14 ; FALLBACK0-NEXT: orq %rbx, %r14 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 ; FALLBACK0-NEXT: movq %r8, (%rdx) ; FALLBACK0-NEXT: movq %r14, 48(%rdx) ; FALLBACK0-NEXT: movq %r13, 56(%rdx) ; FALLBACK0-NEXT: movq %r10, 32(%rdx) ; FALLBACK0-NEXT: movq %r15, 40(%rdx) ; FALLBACK0-NEXT: 
movq %rdi, 16(%rdx) ; FALLBACK0-NEXT: movq %r11, 24(%rdx) ; FALLBACK0-NEXT: movq %r9, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx ; FALLBACK0-NEXT: popq %r12 ; FALLBACK0-NEXT: popq %r13 ; FALLBACK0-NEXT: popq %r14 ; FALLBACK0-NEXT: popq %r15 ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: shl_64bytes: ; FALLBACK1: # %bb.0: ; FALLBACK1-NEXT: pushq %r14 ; FALLBACK1-NEXT: pushq %rbx ; FALLBACK1-NEXT: pushq %rax ; FALLBACK1-NEXT: movq (%rdi), %rax ; FALLBACK1-NEXT: movq 8(%rdi), %rcx ; FALLBACK1-NEXT: movq 16(%rdi), %r8 ; FALLBACK1-NEXT: movq 24(%rdi), %r9 ; FALLBACK1-NEXT: movq 32(%rdi), %r10 ; FALLBACK1-NEXT: movq 40(%rdi), %r11 ; FALLBACK1-NEXT: movq 48(%rdi), %rbx ; FALLBACK1-NEXT: movq 56(%rdi), %rdi ; FALLBACK1-NEXT: movl (%rsi), %esi ; FALLBACK1-NEXT: xorps %xmm0, %xmm0 ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: leal (,%rsi,8), %ecx ; FALLBACK1-NEXT: andl $56, %ecx ; FALLBACK1-NEXT: andl $56, %esi ; FALLBACK1-NEXT: negl %esi ; FALLBACK1-NEXT: movslq %esi, %r9 ; FALLBACK1-NEXT: movq -48(%rsp,%r9), %rax ; FALLBACK1-NEXT: movq -40(%rsp,%r9), %r10 ; FALLBACK1-NEXT: movq %r10, %rsi ; FALLBACK1-NEXT: shldq %cl, %rax, %rsi ; FALLBACK1-NEXT: movq -64(%rsp,%r9), %r8 ; FALLBACK1-NEXT: movq -56(%rsp,%r9), %rdi ; FALLBACK1-NEXT: shldq %cl, %rdi, %rax ; FALLBACK1-NEXT: movq -32(%rsp,%r9), %r11 ; FALLBACK1-NEXT: movq -24(%rsp,%r9), %rbx ; FALLBACK1-NEXT: movq %rbx, %r14 ; FALLBACK1-NEXT: shldq %cl, %r11, %r14 ; FALLBACK1-NEXT: shldq %cl, %r10, %r11 
; FALLBACK1-NEXT: movq -16(%rsp,%r9), %r10 ; FALLBACK1-NEXT: movq -8(%rsp,%r9), %r9 ; FALLBACK1-NEXT: shldq %cl, %r10, %r9 ; FALLBACK1-NEXT: shldq %cl, %rbx, %r10 ; FALLBACK1-NEXT: shldq %cl, %r8, %rdi ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK1-NEXT: shlq %cl, %r8 ; FALLBACK1-NEXT: movq %r10, 48(%rdx) ; FALLBACK1-NEXT: movq %r9, 56(%rdx) ; FALLBACK1-NEXT: movq %r11, 32(%rdx) ; FALLBACK1-NEXT: movq %r14, 40(%rdx) ; FALLBACK1-NEXT: movq %rax, 16(%rdx) ; FALLBACK1-NEXT: movq %rsi, 24(%rdx) ; FALLBACK1-NEXT: movq %r8, (%rdx) ; FALLBACK1-NEXT: movq %rdi, 8(%rdx) ; FALLBACK1-NEXT: addq $8, %rsp ; FALLBACK1-NEXT: popq %rbx ; FALLBACK1-NEXT: popq %r14 ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: shl_64bytes: ; FALLBACK2: # %bb.0: ; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 ; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax ; FALLBACK2-NEXT: movq (%rdi), %rax ; FALLBACK2-NEXT: movq 8(%rdi), %rcx ; FALLBACK2-NEXT: movq 16(%rdi), %r8 ; FALLBACK2-NEXT: movq 24(%rdi), %r9 ; FALLBACK2-NEXT: movq 32(%rdi), %r10 ; FALLBACK2-NEXT: movq 40(%rdi), %r11 ; FALLBACK2-NEXT: movq 48(%rdi), %rbx ; FALLBACK2-NEXT: movq 56(%rdi), %rdi ; FALLBACK2-NEXT: movl (%rsi), %esi ; FALLBACK2-NEXT: xorps %xmm0, %xmm0 ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rsi,8), %eax ; FALLBACK2-NEXT: andl $56, %eax ; FALLBACK2-NEXT: 
andl $56, %esi ; FALLBACK2-NEXT: negl %esi ; FALLBACK2-NEXT: movslq %esi, %rsi ; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10 ; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx ; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9 ; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 ; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14 ; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx ; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8 ; FALLBACK2-NEXT: shlxq %rax, %r8, %r15 ; FALLBACK2-NEXT: shlxq %rax, %r10, %r12 ; FALLBACK2-NEXT: movl %eax, %r13d ; FALLBACK2-NEXT: notb %r13b ; FALLBACK2-NEXT: shrq %r10 ; FALLBACK2-NEXT: shrxq %r13, %r10, %r10 ; FALLBACK2-NEXT: orq %r9, %r10 ; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9 ; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp ; FALLBACK2-NEXT: shrq %r14 ; FALLBACK2-NEXT: shrxq %r13, %r14, %r14 ; FALLBACK2-NEXT: orq %r11, %r14 ; FALLBACK2-NEXT: shlxq %rax, -8(%rsp,%rsi), %r11 ; FALLBACK2-NEXT: movq -16(%rsp,%rsi), %rsi ; FALLBACK2-NEXT: shlxq %rax, %rsi, %rax ; FALLBACK2-NEXT: shrq %rcx ; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx ; FALLBACK2-NEXT: orq %rbx, %rcx ; FALLBACK2-NEXT: shrq %r9 ; FALLBACK2-NEXT: shrxq %r13, %r9, %r9 ; FALLBACK2-NEXT: orq %r15, %r9 ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi ; FALLBACK2-NEXT: orq %rbp, %rdi ; FALLBACK2-NEXT: shrq %rsi ; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi ; FALLBACK2-NEXT: orq %r11, %rsi ; FALLBACK2-NEXT: shrq %r8 ; FALLBACK2-NEXT: shrxq %r13, %r8, %r8 ; FALLBACK2-NEXT: orq %rax, %r8 ; FALLBACK2-NEXT: movq %r12, (%rdx) ; FALLBACK2-NEXT: movq %r8, 48(%rdx) ; FALLBACK2-NEXT: movq %rsi, 56(%rdx) ; FALLBACK2-NEXT: movq %rdi, 32(%rdx) ; FALLBACK2-NEXT: movq %r9, 40(%rdx) ; FALLBACK2-NEXT: movq %rcx, 16(%rdx) ; FALLBACK2-NEXT: movq %r14, 24(%rdx) ; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 ; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 ; FALLBACK2-NEXT: popq %rbp ; 
FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: shl_64bytes: ; FALLBACK3: # %bb.0: ; FALLBACK3-NEXT: pushq %r14 ; FALLBACK3-NEXT: pushq %rbx ; FALLBACK3-NEXT: pushq %rax ; FALLBACK3-NEXT: movq (%rdi), %rax ; FALLBACK3-NEXT: movq 8(%rdi), %rcx ; FALLBACK3-NEXT: movq 16(%rdi), %r8 ; FALLBACK3-NEXT: movq 24(%rdi), %r9 ; FALLBACK3-NEXT: movq 32(%rdi), %r10 ; FALLBACK3-NEXT: movq 40(%rdi), %r11 ; FALLBACK3-NEXT: movq 48(%rdi), %rbx ; FALLBACK3-NEXT: movq 56(%rdi), %rdi ; FALLBACK3-NEXT: movl (%rsi), %esi ; FALLBACK3-NEXT: xorps %xmm0, %xmm0 ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: leal (,%rsi,8), %ecx ; FALLBACK3-NEXT: andl $56, %ecx ; FALLBACK3-NEXT: andl $56, %esi ; FALLBACK3-NEXT: negl %esi ; FALLBACK3-NEXT: movslq %esi, %r8 ; FALLBACK3-NEXT: movq -48(%rsp,%r8), %rax ; FALLBACK3-NEXT: movq -40(%rsp,%r8), %r9 ; FALLBACK3-NEXT: movq %r9, %rsi ; FALLBACK3-NEXT: shldq %cl, %rax, %rsi ; FALLBACK3-NEXT: movq -64(%rsp,%r8), %r10 ; FALLBACK3-NEXT: movq -56(%rsp,%r8), %rdi ; FALLBACK3-NEXT: shldq %cl, %rdi, %rax ; FALLBACK3-NEXT: movq -32(%rsp,%r8), %r11 ; FALLBACK3-NEXT: movq -24(%rsp,%r8), %rbx ; FALLBACK3-NEXT: movq %rbx, %r14 ; FALLBACK3-NEXT: shldq %cl, %r11, %r14 ; FALLBACK3-NEXT: shldq %cl, %r9, %r11 ; FALLBACK3-NEXT: movq -16(%rsp,%r8), %r9 ; FALLBACK3-NEXT: movq -8(%rsp,%r8), %r8 ; FALLBACK3-NEXT: shldq %cl, %r9, %r8 ; FALLBACK3-NEXT: shldq %cl, %rbx, %r9 ; FALLBACK3-NEXT: shldq %cl, %r10, %rdi ; FALLBACK3-NEXT: shlxq %rcx, %r10, 
%rcx ; FALLBACK3-NEXT: movq %r9, 48(%rdx) ; FALLBACK3-NEXT: movq %r8, 56(%rdx) ; FALLBACK3-NEXT: movq %r11, 32(%rdx) ; FALLBACK3-NEXT: movq %r14, 40(%rdx) ; FALLBACK3-NEXT: movq %rax, 16(%rdx) ; FALLBACK3-NEXT: movq %rsi, 24(%rdx) ; FALLBACK3-NEXT: movq %rcx, (%rdx) ; FALLBACK3-NEXT: movq %rdi, 8(%rdx) ; FALLBACK3-NEXT: addq $8, %rsp ; FALLBACK3-NEXT: popq %rbx ; FALLBACK3-NEXT: popq %r14 ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: shl_64bytes: ; FALLBACK4: # %bb.0: ; FALLBACK4-NEXT: pushq %r15 ; FALLBACK4-NEXT: pushq %r14 ; FALLBACK4-NEXT: pushq %r13 ; FALLBACK4-NEXT: pushq %r12 ; FALLBACK4-NEXT: pushq %rbx ; FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK4-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK4-NEXT: movl (%rsi), %ecx ; FALLBACK4-NEXT: xorps %xmm4, %xmm4 ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: leal (,%rcx,8), %eax ; FALLBACK4-NEXT: andl $56, %eax ; FALLBACK4-NEXT: andl $56, %ecx ; FALLBACK4-NEXT: negl %ecx ; FALLBACK4-NEXT: movslq %ecx, %r9 ; FALLBACK4-NEXT: movq -24(%rsp,%r9), %rdi ; FALLBACK4-NEXT: movq %rdi, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: movq -32(%rsp,%r9), %r11 ; FALLBACK4-NEXT: movq %r11, %r8 ; FALLBACK4-NEXT: shrq %r8 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r8 ; FALLBACK4-NEXT: orq %r10, %r8 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r11 ; FALLBACK4-NEXT: movq -40(%rsp,%r9), %rbx ; FALLBACK4-NEXT: movq %rbx, %r10 ; FALLBACK4-NEXT: shrq %r10 ; 
FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 ; FALLBACK4-NEXT: orq %r11, %r10 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %rbx ; FALLBACK4-NEXT: movq -48(%rsp,%r9), %r15 ; FALLBACK4-NEXT: movq %r15, %r11 ; FALLBACK4-NEXT: shrq %r11 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r11 ; FALLBACK4-NEXT: orq %rbx, %r11 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r15 ; FALLBACK4-NEXT: movq -64(%rsp,%r9), %r14 ; FALLBACK4-NEXT: movq -56(%rsp,%r9), %r12 ; FALLBACK4-NEXT: movq %r12, %rbx ; FALLBACK4-NEXT: shrq %rbx ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %rbx ; FALLBACK4-NEXT: orq %r15, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r12 ; FALLBACK4-NEXT: movq %r14, %r15 ; FALLBACK4-NEXT: shrq %r15 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r15 ; FALLBACK4-NEXT: orq %r12, %r15 ; FALLBACK4-NEXT: movq -16(%rsp,%r9), %r12 ; FALLBACK4-NEXT: movq %r12, %r13 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r13 ; FALLBACK4-NEXT: shrq %rdi ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %rdi ; FALLBACK4-NEXT: orq %r13, %rdi ; FALLBACK4-NEXT: movq -8(%rsp,%r9), %r9 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r9 ; FALLBACK4-NEXT: shrq %r12 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shrq %cl, %r12 ; FALLBACK4-NEXT: orq %r9, %r12 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r14 ; FALLBACK4-NEXT: movq %r14, (%rdx) ; FALLBACK4-NEXT: movq %r12, 56(%rdx) ; FALLBACK4-NEXT: movq %rdi, 48(%rdx) ; FALLBACK4-NEXT: movq %r15, 8(%rdx) ; FALLBACK4-NEXT: movq %rbx, 16(%rdx) ; FALLBACK4-NEXT: movq %r11, 24(%rdx) ; FALLBACK4-NEXT: movq %r10, 32(%rdx) ; FALLBACK4-NEXT: movq %r8, 40(%rdx) ; FALLBACK4-NEXT: popq %rbx ; FALLBACK4-NEXT: popq %r12 ; FALLBACK4-NEXT: popq %r13 ; FALLBACK4-NEXT: popq %r14 ; FALLBACK4-NEXT: popq %r15 ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: shl_64bytes: ; FALLBACK5: # 
%bb.0: ; FALLBACK5-NEXT: pushq %r15 ; FALLBACK5-NEXT: pushq %r14 ; FALLBACK5-NEXT: pushq %rbx ; FALLBACK5-NEXT: movups (%rdi), %xmm0 ; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK5-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK5-NEXT: movl (%rsi), %eax ; FALLBACK5-NEXT: xorps %xmm4, %xmm4 ; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: leal (,%rax,8), %ecx ; FALLBACK5-NEXT: andl $56, %ecx ; FALLBACK5-NEXT: andl $56, %eax ; FALLBACK5-NEXT: negl %eax ; FALLBACK5-NEXT: movslq %eax, %r8 ; FALLBACK5-NEXT: movq -32(%rsp,%r8), %rax ; FALLBACK5-NEXT: movq -24(%rsp,%r8), %r9 ; FALLBACK5-NEXT: movq %r9, %rsi ; FALLBACK5-NEXT: shldq %cl, %rax, %rsi ; FALLBACK5-NEXT: movq -40(%rsp,%r8), %rdi ; FALLBACK5-NEXT: shldq %cl, %rdi, %rax ; FALLBACK5-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK5-NEXT: shldq %cl, %r10, %rdi ; FALLBACK5-NEXT: movq -64(%rsp,%r8), %r11 ; FALLBACK5-NEXT: movq -56(%rsp,%r8), %rbx ; FALLBACK5-NEXT: shldq %cl, %rbx, %r10 ; FALLBACK5-NEXT: movq -16(%rsp,%r8), %r14 ; FALLBACK5-NEXT: movq %r14, %r15 ; FALLBACK5-NEXT: shldq %cl, %r9, %r15 ; FALLBACK5-NEXT: movq -8(%rsp,%r8), %r8 ; FALLBACK5-NEXT: shldq %cl, %r14, %r8 ; FALLBACK5-NEXT: movq %r11, %r9 ; FALLBACK5-NEXT: shlq %cl, %r9 ; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK5-NEXT: shldq %cl, %r11, %rbx ; FALLBACK5-NEXT: movq %r8, 56(%rdx) ; FALLBACK5-NEXT: movq %r15, 48(%rdx) ; FALLBACK5-NEXT: movq %rbx, 8(%rdx) ; FALLBACK5-NEXT: movq %r10, 16(%rdx) ; FALLBACK5-NEXT: movq %rdi, 24(%rdx) ; FALLBACK5-NEXT: movq %rax, 32(%rdx) ; FALLBACK5-NEXT: movq %rsi, 40(%rdx) ; FALLBACK5-NEXT: movq %r9, 
(%rdx) ; FALLBACK5-NEXT: popq %rbx ; FALLBACK5-NEXT: popq %r14 ; FALLBACK5-NEXT: popq %r15 ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: shl_64bytes: ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx ; FALLBACK6-NEXT: subq $24, %rsp ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK6-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK6-NEXT: movl (%rsi), %eax ; FALLBACK6-NEXT: xorps %xmm4, %xmm4 ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm3, (%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: leal (,%rax,8), %ecx ; FALLBACK6-NEXT: andl $56, %ecx ; FALLBACK6-NEXT: andl $56, %eax ; FALLBACK6-NEXT: negl %eax ; FALLBACK6-NEXT: movslq %eax, %rsi ; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax ; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12 ; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi ; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15 ; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13 ; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8 ; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; FALLBACK6-NEXT: movq -32(%rsp,%rsi), %r11 ; FALLBACK6-NEXT: shlxq %rcx, %r11, %r10 ; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14 ; FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx ; FALLBACK6-NEXT: movl %ecx, %r9d ; FALLBACK6-NEXT: notb %r9b ; FALLBACK6-NEXT: shrq %rdi ; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi ; FALLBACK6-NEXT: orq %r12, %rdi ; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp ; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8 ; FALLBACK6-NEXT: shrq %r13 ; FALLBACK6-NEXT: shrxq %r9, %r13, %r12 ; FALLBACK6-NEXT: orq 
%r15, %r12 ; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 ; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi ; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx ; FALLBACK6-NEXT: shrq %r11 ; FALLBACK6-NEXT: shrxq %r9, %r11, %r11 ; FALLBACK6-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; FALLBACK6-NEXT: shrq %r14 ; FALLBACK6-NEXT: shrxq %r9, %r14, %r14 ; FALLBACK6-NEXT: orq %r10, %r14 ; FALLBACK6-NEXT: shrq %rsi ; FALLBACK6-NEXT: shrxq %r9, %rsi, %rsi ; FALLBACK6-NEXT: orq %rbx, %rsi ; FALLBACK6-NEXT: shrq %rax ; FALLBACK6-NEXT: shrxq %r9, %rax, %rax ; FALLBACK6-NEXT: orq %r8, %rax ; FALLBACK6-NEXT: shrq %rbp ; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8 ; FALLBACK6-NEXT: orq %r15, %r8 ; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %r8, 56(%rdx) ; FALLBACK6-NEXT: movq %rax, 48(%rdx) ; FALLBACK6-NEXT: movq %rsi, 8(%rdx) ; FALLBACK6-NEXT: movq %r14, 16(%rdx) ; FALLBACK6-NEXT: movq %r11, 24(%rdx) ; FALLBACK6-NEXT: movq %r12, 32(%rdx) ; FALLBACK6-NEXT: movq %rdi, 40(%rdx) ; FALLBACK6-NEXT: addq $24, %rsp ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 ; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: shl_64bytes: ; FALLBACK7: # %bb.0: ; FALLBACK7-NEXT: pushq %r15 ; FALLBACK7-NEXT: pushq %r14 ; FALLBACK7-NEXT: pushq %rbx ; FALLBACK7-NEXT: movups (%rdi), %xmm0 ; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK7-NEXT: movups 48(%rdi), %xmm3 ; FALLBACK7-NEXT: movl (%rsi), %eax ; FALLBACK7-NEXT: xorps %xmm4, %xmm4 ; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; 
FALLBACK7-NEXT: leal (,%rax,8), %ecx ; FALLBACK7-NEXT: andl $56, %ecx ; FALLBACK7-NEXT: andl $56, %eax ; FALLBACK7-NEXT: negl %eax ; FALLBACK7-NEXT: movslq %eax, %r8 ; FALLBACK7-NEXT: movq -32(%rsp,%r8), %rax ; FALLBACK7-NEXT: movq -24(%rsp,%r8), %r9 ; FALLBACK7-NEXT: movq %r9, %rsi ; FALLBACK7-NEXT: shldq %cl, %rax, %rsi ; FALLBACK7-NEXT: movq -40(%rsp,%r8), %rdi ; FALLBACK7-NEXT: shldq %cl, %rdi, %rax ; FALLBACK7-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK7-NEXT: shldq %cl, %r10, %rdi ; FALLBACK7-NEXT: movq -64(%rsp,%r8), %r11 ; FALLBACK7-NEXT: movq -56(%rsp,%r8), %rbx ; FALLBACK7-NEXT: shldq %cl, %rbx, %r10 ; FALLBACK7-NEXT: movq -16(%rsp,%r8), %r14 ; FALLBACK7-NEXT: movq %r14, %r15 ; FALLBACK7-NEXT: shldq %cl, %r9, %r15 ; FALLBACK7-NEXT: movq -8(%rsp,%r8), %r8 ; FALLBACK7-NEXT: shldq %cl, %r14, %r8 ; FALLBACK7-NEXT: shlxq %rcx, %r11, %r9 ; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK7-NEXT: shldq %cl, %r11, %rbx ; FALLBACK7-NEXT: movq %r8, 56(%rdx) ; FALLBACK7-NEXT: movq %r15, 48(%rdx) ; FALLBACK7-NEXT: movq %rbx, 8(%rdx) ; FALLBACK7-NEXT: movq %r10, 16(%rdx) ; FALLBACK7-NEXT: movq %rdi, 24(%rdx) ; FALLBACK7-NEXT: movq %rax, 32(%rdx) ; FALLBACK7-NEXT: movq %rsi, 40(%rdx) ; FALLBACK7-NEXT: movq %r9, (%rdx) ; FALLBACK7-NEXT: popq %rbx ; FALLBACK7-NEXT: popq %r14 ; FALLBACK7-NEXT: popq %r15 ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: shl_64bytes: ; FALLBACK8: # %bb.0: ; FALLBACK8-NEXT: pushq %r15 ; FALLBACK8-NEXT: pushq %r14 ; FALLBACK8-NEXT: pushq %r13 ; FALLBACK8-NEXT: pushq %r12 ; FALLBACK8-NEXT: pushq %rbx ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: vmovups 32(%rdi), %ymm1 ; FALLBACK8-NEXT: movl (%rsi), %ecx ; FALLBACK8-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: leal (,%rcx,8), %eax ; FALLBACK8-NEXT: andl $56, %eax ; 
FALLBACK8-NEXT: andl $56, %ecx ; FALLBACK8-NEXT: negl %ecx ; FALLBACK8-NEXT: movslq %ecx, %r9 ; FALLBACK8-NEXT: movq -24(%rsp,%r9), %rdi ; FALLBACK8-NEXT: movq %rdi, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: movq -32(%rsp,%r9), %r11 ; FALLBACK8-NEXT: movq %r11, %r8 ; FALLBACK8-NEXT: shrq %r8 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r8 ; FALLBACK8-NEXT: orq %r10, %r8 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r11 ; FALLBACK8-NEXT: movq -40(%rsp,%r9), %rbx ; FALLBACK8-NEXT: movq %rbx, %r10 ; FALLBACK8-NEXT: shrq %r10 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 ; FALLBACK8-NEXT: orq %r11, %r10 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %rbx ; FALLBACK8-NEXT: movq -48(%rsp,%r9), %r15 ; FALLBACK8-NEXT: movq %r15, %r11 ; FALLBACK8-NEXT: shrq %r11 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r11 ; FALLBACK8-NEXT: orq %rbx, %r11 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r15 ; FALLBACK8-NEXT: movq -64(%rsp,%r9), %r14 ; FALLBACK8-NEXT: movq -56(%rsp,%r9), %r12 ; FALLBACK8-NEXT: movq %r12, %rbx ; FALLBACK8-NEXT: shrq %rbx ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %rbx ; FALLBACK8-NEXT: orq %r15, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r12 ; FALLBACK8-NEXT: movq %r14, %r15 ; FALLBACK8-NEXT: shrq %r15 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r15 ; FALLBACK8-NEXT: orq %r12, %r15 ; FALLBACK8-NEXT: movq -16(%rsp,%r9), %r12 ; FALLBACK8-NEXT: movq %r12, %r13 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r13 ; FALLBACK8-NEXT: shrq %rdi ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %rdi ; FALLBACK8-NEXT: orq %r13, %rdi ; FALLBACK8-NEXT: movq -8(%rsp,%r9), %r9 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r9 ; FALLBACK8-NEXT: shrq %r12 ; 
FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shrq %cl, %r12 ; FALLBACK8-NEXT: orq %r9, %r12 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r14 ; FALLBACK8-NEXT: movq %r14, (%rdx) ; FALLBACK8-NEXT: movq %r12, 56(%rdx) ; FALLBACK8-NEXT: movq %rdi, 48(%rdx) ; FALLBACK8-NEXT: movq %r15, 8(%rdx) ; FALLBACK8-NEXT: movq %rbx, 16(%rdx) ; FALLBACK8-NEXT: movq %r11, 24(%rdx) ; FALLBACK8-NEXT: movq %r10, 32(%rdx) ; FALLBACK8-NEXT: movq %r8, 40(%rdx) ; FALLBACK8-NEXT: popq %rbx ; FALLBACK8-NEXT: popq %r12 ; FALLBACK8-NEXT: popq %r13 ; FALLBACK8-NEXT: popq %r14 ; FALLBACK8-NEXT: popq %r15 ; FALLBACK8-NEXT: vzeroupper ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: shl_64bytes: ; FALLBACK9: # %bb.0: ; FALLBACK9-NEXT: pushq %r15 ; FALLBACK9-NEXT: pushq %r14 ; FALLBACK9-NEXT: pushq %rbx ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 ; FALLBACK9-NEXT: movl (%rsi), %eax ; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: leal (,%rax,8), %ecx ; FALLBACK9-NEXT: andl $56, %ecx ; FALLBACK9-NEXT: andl $56, %eax ; FALLBACK9-NEXT: negl %eax ; FALLBACK9-NEXT: movslq %eax, %r8 ; FALLBACK9-NEXT: movq -32(%rsp,%r8), %rax ; FALLBACK9-NEXT: movq -24(%rsp,%r8), %r9 ; FALLBACK9-NEXT: movq %r9, %rsi ; FALLBACK9-NEXT: shldq %cl, %rax, %rsi ; FALLBACK9-NEXT: movq -40(%rsp,%r8), %rdi ; FALLBACK9-NEXT: shldq %cl, %rdi, %rax ; FALLBACK9-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK9-NEXT: shldq %cl, %r10, %rdi ; FALLBACK9-NEXT: movq -64(%rsp,%r8), %r11 ; FALLBACK9-NEXT: movq -56(%rsp,%r8), %rbx ; FALLBACK9-NEXT: shldq %cl, %rbx, %r10 ; FALLBACK9-NEXT: movq -16(%rsp,%r8), %r14 ; FALLBACK9-NEXT: movq %r14, %r15 ; FALLBACK9-NEXT: shldq %cl, %r9, %r15 ; FALLBACK9-NEXT: movq -8(%rsp,%r8), %r8 ; FALLBACK9-NEXT: shldq %cl, %r14, %r8 ; FALLBACK9-NEXT: 
movq %r11, %r9 ; FALLBACK9-NEXT: shlq %cl, %r9 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shldq %cl, %r11, %rbx ; FALLBACK9-NEXT: movq %r8, 56(%rdx) ; FALLBACK9-NEXT: movq %r15, 48(%rdx) ; FALLBACK9-NEXT: movq %rbx, 8(%rdx) ; FALLBACK9-NEXT: movq %r10, 16(%rdx) ; FALLBACK9-NEXT: movq %rdi, 24(%rdx) ; FALLBACK9-NEXT: movq %rax, 32(%rdx) ; FALLBACK9-NEXT: movq %rsi, 40(%rdx) ; FALLBACK9-NEXT: movq %r9, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 ; FALLBACK9-NEXT: popq %r15 ; FALLBACK9-NEXT: vzeroupper ; FALLBACK9-NEXT: retq ; ; FALLBACK10-LABEL: shl_64bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx ; FALLBACK10-NEXT: subq $24, %rsp ; FALLBACK10-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %ymm1 ; FALLBACK10-NEXT: movl (%rsi), %eax ; FALLBACK10-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: leal (,%rax,8), %ecx ; FALLBACK10-NEXT: andl $56, %ecx ; FALLBACK10-NEXT: andl $56, %eax ; FALLBACK10-NEXT: negl %eax ; FALLBACK10-NEXT: movslq %eax, %rsi ; FALLBACK10-NEXT: movq -8(%rsp,%rsi), %rax ; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12 ; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi ; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15 ; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13 ; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8 ; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; FALLBACK10-NEXT: movq -32(%rsp,%rsi), %r11 ; FALLBACK10-NEXT: shlxq %rcx, %r11, %r10 ; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14 ; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx ; FALLBACK10-NEXT: movl %ecx, %r9d ; FALLBACK10-NEXT: notb %r9b ; FALLBACK10-NEXT: shrq %rdi ; 
FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi ; FALLBACK10-NEXT: orq %r12, %rdi ; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp ; FALLBACK10-NEXT: shlxq %rcx, %rbp, %r8 ; FALLBACK10-NEXT: shrq %r13 ; FALLBACK10-NEXT: shrxq %r9, %r13, %r12 ; FALLBACK10-NEXT: orq %r15, %r12 ; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 ; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi ; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx ; FALLBACK10-NEXT: shrq %r11 ; FALLBACK10-NEXT: shrxq %r9, %r11, %r11 ; FALLBACK10-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; FALLBACK10-NEXT: shrq %r14 ; FALLBACK10-NEXT: shrxq %r9, %r14, %r14 ; FALLBACK10-NEXT: orq %r10, %r14 ; FALLBACK10-NEXT: shrq %rsi ; FALLBACK10-NEXT: shrxq %r9, %rsi, %rsi ; FALLBACK10-NEXT: orq %rbx, %rsi ; FALLBACK10-NEXT: shrq %rax ; FALLBACK10-NEXT: shrxq %r9, %rax, %rax ; FALLBACK10-NEXT: orq %r8, %rax ; FALLBACK10-NEXT: shrq %rbp ; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8 ; FALLBACK10-NEXT: orq %r15, %r8 ; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %r8, 56(%rdx) ; FALLBACK10-NEXT: movq %rax, 48(%rdx) ; FALLBACK10-NEXT: movq %rsi, 8(%rdx) ; FALLBACK10-NEXT: movq %r14, 16(%rdx) ; FALLBACK10-NEXT: movq %r11, 24(%rdx) ; FALLBACK10-NEXT: movq %r12, 32(%rdx) ; FALLBACK10-NEXT: movq %rdi, 40(%rdx) ; FALLBACK10-NEXT: addq $24, %rsp ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 ; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: shl_64bytes: ; FALLBACK11: # %bb.0: ; FALLBACK11-NEXT: pushq %r15 ; FALLBACK11-NEXT: pushq %r14 ; FALLBACK11-NEXT: pushq %rbx ; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1 ; FALLBACK11-NEXT: movl (%rsi), %eax ; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; 
FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: leal (,%rax,8), %ecx ; FALLBACK11-NEXT: andl $56, %ecx ; FALLBACK11-NEXT: andl $56, %eax ; FALLBACK11-NEXT: negl %eax ; FALLBACK11-NEXT: movslq %eax, %r8 ; FALLBACK11-NEXT: movq -32(%rsp,%r8), %rax ; FALLBACK11-NEXT: movq -24(%rsp,%r8), %r9 ; FALLBACK11-NEXT: movq %r9, %rsi ; FALLBACK11-NEXT: shldq %cl, %rax, %rsi ; FALLBACK11-NEXT: movq -40(%rsp,%r8), %rdi ; FALLBACK11-NEXT: shldq %cl, %rdi, %rax ; FALLBACK11-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK11-NEXT: shldq %cl, %r10, %rdi ; FALLBACK11-NEXT: movq -64(%rsp,%r8), %r11 ; FALLBACK11-NEXT: movq -56(%rsp,%r8), %rbx ; FALLBACK11-NEXT: shldq %cl, %rbx, %r10 ; FALLBACK11-NEXT: movq -16(%rsp,%r8), %r14 ; FALLBACK11-NEXT: movq %r14, %r15 ; FALLBACK11-NEXT: shldq %cl, %r9, %r15 ; FALLBACK11-NEXT: movq -8(%rsp,%r8), %r8 ; FALLBACK11-NEXT: shldq %cl, %r14, %r8 ; FALLBACK11-NEXT: shlxq %rcx, %r11, %r9 ; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK11-NEXT: shldq %cl, %r11, %rbx ; FALLBACK11-NEXT: movq %r8, 56(%rdx) ; FALLBACK11-NEXT: movq %r15, 48(%rdx) ; FALLBACK11-NEXT: movq %rbx, 8(%rdx) ; FALLBACK11-NEXT: movq %r10, 16(%rdx) ; FALLBACK11-NEXT: movq %rdi, 24(%rdx) ; FALLBACK11-NEXT: movq %rax, 32(%rdx) ; FALLBACK11-NEXT: movq %rsi, 40(%rdx) ; FALLBACK11-NEXT: movq %r9, (%rdx) ; FALLBACK11-NEXT: popq %rbx ; FALLBACK11-NEXT: popq %r14 ; FALLBACK11-NEXT: popq %r15 ; FALLBACK11-NEXT: vzeroupper ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: shl_64bytes: ; FALLBACK12: # %bb.0: ; FALLBACK12-NEXT: pushq %r15 ; FALLBACK12-NEXT: pushq %r14 ; FALLBACK12-NEXT: pushq %r13 ; FALLBACK12-NEXT: pushq %r12 ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK12-NEXT: movl (%rsi), %ecx ; FALLBACK12-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: leal (,%rcx,8), %eax ; FALLBACK12-NEXT: andl $56, %eax ; 
FALLBACK12-NEXT: andl $56, %ecx ; FALLBACK12-NEXT: negl %ecx ; FALLBACK12-NEXT: movslq %ecx, %r9 ; FALLBACK12-NEXT: movq -24(%rsp,%r9), %rdi ; FALLBACK12-NEXT: movq %rdi, %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: movq -32(%rsp,%r9), %r11 ; FALLBACK12-NEXT: movq %r11, %r8 ; FALLBACK12-NEXT: shrq %r8 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r8 ; FALLBACK12-NEXT: orq %r10, %r8 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r11 ; FALLBACK12-NEXT: movq -40(%rsp,%r9), %rbx ; FALLBACK12-NEXT: movq %rbx, %r10 ; FALLBACK12-NEXT: shrq %r10 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 ; FALLBACK12-NEXT: orq %r11, %r10 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: movq -48(%rsp,%r9), %r15 ; FALLBACK12-NEXT: movq %r15, %r11 ; FALLBACK12-NEXT: shrq %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r11 ; FALLBACK12-NEXT: orq %rbx, %r11 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r15 ; FALLBACK12-NEXT: movq -64(%rsp,%r9), %r14 ; FALLBACK12-NEXT: movq -56(%rsp,%r9), %r12 ; FALLBACK12-NEXT: movq %r12, %rbx ; FALLBACK12-NEXT: shrq %rbx ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbx ; FALLBACK12-NEXT: orq %r15, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: movq %r14, %r15 ; FALLBACK12-NEXT: shrq %r15 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r15 ; FALLBACK12-NEXT: orq %r12, %r15 ; FALLBACK12-NEXT: movq -16(%rsp,%r9), %r12 ; FALLBACK12-NEXT: movq %r12, %r13 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r13 ; FALLBACK12-NEXT: shrq %rdi ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %rdi ; FALLBACK12-NEXT: orq %r13, %rdi ; FALLBACK12-NEXT: movq -8(%rsp,%r9), %r9 ; FALLBACK12-NEXT: movl %eax, %ecx ; 
FALLBACK12-NEXT: shlq %cl, %r9 ; FALLBACK12-NEXT: shrq %r12 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shrq %cl, %r12 ; FALLBACK12-NEXT: orq %r9, %r12 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r14 ; FALLBACK12-NEXT: movq %r14, (%rdx) ; FALLBACK12-NEXT: movq %r12, 56(%rdx) ; FALLBACK12-NEXT: movq %rdi, 48(%rdx) ; FALLBACK12-NEXT: movq %r15, 8(%rdx) ; FALLBACK12-NEXT: movq %rbx, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) ; FALLBACK12-NEXT: movq %r10, 32(%rdx) ; FALLBACK12-NEXT: movq %r8, 40(%rdx) ; FALLBACK12-NEXT: popq %rbx ; FALLBACK12-NEXT: popq %r12 ; FALLBACK12-NEXT: popq %r13 ; FALLBACK12-NEXT: popq %r14 ; FALLBACK12-NEXT: popq %r15 ; FALLBACK12-NEXT: vzeroupper ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: shl_64bytes: ; FALLBACK13: # %bb.0: ; FALLBACK13-NEXT: pushq %r15 ; FALLBACK13-NEXT: pushq %r14 ; FALLBACK13-NEXT: pushq %rbx ; FALLBACK13-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK13-NEXT: movl (%rsi), %eax ; FALLBACK13-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: leal (,%rax,8), %ecx ; FALLBACK13-NEXT: andl $56, %ecx ; FALLBACK13-NEXT: andl $56, %eax ; FALLBACK13-NEXT: negl %eax ; FALLBACK13-NEXT: movslq %eax, %r8 ; FALLBACK13-NEXT: movq -32(%rsp,%r8), %rax ; FALLBACK13-NEXT: movq -24(%rsp,%r8), %r9 ; FALLBACK13-NEXT: movq %r9, %rsi ; FALLBACK13-NEXT: shldq %cl, %rax, %rsi ; FALLBACK13-NEXT: movq -40(%rsp,%r8), %rdi ; FALLBACK13-NEXT: shldq %cl, %rdi, %rax ; FALLBACK13-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK13-NEXT: shldq %cl, %r10, %rdi ; FALLBACK13-NEXT: movq -64(%rsp,%r8), %r11 ; FALLBACK13-NEXT: movq -56(%rsp,%r8), %rbx ; FALLBACK13-NEXT: shldq %cl, %rbx, %r10 ; FALLBACK13-NEXT: movq -16(%rsp,%r8), %r14 ; FALLBACK13-NEXT: movq %r14, %r15 ; FALLBACK13-NEXT: shldq %cl, %r9, %r15 ; FALLBACK13-NEXT: movq -8(%rsp,%r8), %r8 ; FALLBACK13-NEXT: shldq %cl, %r14, %r8 ; FALLBACK13-NEXT: movq %r11, %r9 ; 
FALLBACK13-NEXT: shlq %cl, %r9 ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK13-NEXT: shldq %cl, %r11, %rbx ; FALLBACK13-NEXT: movq %r8, 56(%rdx) ; FALLBACK13-NEXT: movq %r15, 48(%rdx) ; FALLBACK13-NEXT: movq %rbx, 8(%rdx) ; FALLBACK13-NEXT: movq %r10, 16(%rdx) ; FALLBACK13-NEXT: movq %rdi, 24(%rdx) ; FALLBACK13-NEXT: movq %rax, 32(%rdx) ; FALLBACK13-NEXT: movq %rsi, 40(%rdx) ; FALLBACK13-NEXT: movq %r9, (%rdx) ; FALLBACK13-NEXT: popq %rbx ; FALLBACK13-NEXT: popq %r14 ; FALLBACK13-NEXT: popq %r15 ; FALLBACK13-NEXT: vzeroupper ; FALLBACK13-NEXT: retq ; ; FALLBACK14-LABEL: shl_64bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx ; FALLBACK14-NEXT: subq $24, %rsp ; FALLBACK14-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK14-NEXT: movl (%rsi), %eax ; FALLBACK14-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: leal (,%rax,8), %ecx ; FALLBACK14-NEXT: andl $56, %ecx ; FALLBACK14-NEXT: andl $56, %eax ; FALLBACK14-NEXT: negl %eax ; FALLBACK14-NEXT: movslq %eax, %rsi ; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax ; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12 ; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15 ; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r13 ; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8 ; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; FALLBACK14-NEXT: movq -32(%rsp,%rsi), %r11 ; FALLBACK14-NEXT: shlxq %rcx, %r11, %r10 ; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14 ; FALLBACK14-NEXT: shlxq %rcx, %r14, %rbx ; FALLBACK14-NEXT: movl %ecx, %r9d ; FALLBACK14-NEXT: notb %r9b ; FALLBACK14-NEXT: shrq %rdi ; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi ; FALLBACK14-NEXT: orq %r12, %rdi ; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp ; FALLBACK14-NEXT: shlxq %rcx, 
%rbp, %r8 ; FALLBACK14-NEXT: shrq %r13 ; FALLBACK14-NEXT: shrxq %r9, %r13, %r12 ; FALLBACK14-NEXT: orq %r15, %r12 ; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 ; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi ; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx ; FALLBACK14-NEXT: shrq %r11 ; FALLBACK14-NEXT: shrxq %r9, %r11, %r11 ; FALLBACK14-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload ; FALLBACK14-NEXT: shrq %r14 ; FALLBACK14-NEXT: shrxq %r9, %r14, %r14 ; FALLBACK14-NEXT: orq %r10, %r14 ; FALLBACK14-NEXT: shrq %rsi ; FALLBACK14-NEXT: shrxq %r9, %rsi, %rsi ; FALLBACK14-NEXT: orq %rbx, %rsi ; FALLBACK14-NEXT: shrq %rax ; FALLBACK14-NEXT: shrxq %r9, %rax, %rax ; FALLBACK14-NEXT: orq %r8, %rax ; FALLBACK14-NEXT: shrq %rbp ; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8 ; FALLBACK14-NEXT: orq %r15, %r8 ; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %r8, 56(%rdx) ; FALLBACK14-NEXT: movq %rax, 48(%rdx) ; FALLBACK14-NEXT: movq %rsi, 8(%rdx) ; FALLBACK14-NEXT: movq %r14, 16(%rdx) ; FALLBACK14-NEXT: movq %r11, 24(%rdx) ; FALLBACK14-NEXT: movq %r12, 32(%rdx) ; FALLBACK14-NEXT: movq %rdi, 40(%rdx) ; FALLBACK14-NEXT: addq $24, %rsp ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 ; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: shl_64bytes: ; FALLBACK15: # %bb.0: ; FALLBACK15-NEXT: pushq %r15 ; FALLBACK15-NEXT: pushq %r14 ; FALLBACK15-NEXT: pushq %rbx ; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 ; FALLBACK15-NEXT: movl (%rsi), %eax ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: leal (,%rax,8), %ecx ; FALLBACK15-NEXT: andl $56, %ecx ; FALLBACK15-NEXT: andl $56, %eax ; FALLBACK15-NEXT: negl %eax ; FALLBACK15-NEXT: movslq %eax, %r8 ; FALLBACK15-NEXT: movq -32(%rsp,%r8), %rax ; FALLBACK15-NEXT: movq 
-24(%rsp,%r8), %r9 ; FALLBACK15-NEXT: movq %r9, %rsi ; FALLBACK15-NEXT: shldq %cl, %rax, %rsi ; FALLBACK15-NEXT: movq -40(%rsp,%r8), %rdi ; FALLBACK15-NEXT: shldq %cl, %rdi, %rax ; FALLBACK15-NEXT: movq -48(%rsp,%r8), %r10 ; FALLBACK15-NEXT: shldq %cl, %r10, %rdi ; FALLBACK15-NEXT: movq -64(%rsp,%r8), %r11 ; FALLBACK15-NEXT: movq -56(%rsp,%r8), %rbx ; FALLBACK15-NEXT: shldq %cl, %rbx, %r10 ; FALLBACK15-NEXT: movq -16(%rsp,%r8), %r14 ; FALLBACK15-NEXT: movq %r14, %r15 ; FALLBACK15-NEXT: shldq %cl, %r9, %r15 ; FALLBACK15-NEXT: movq -8(%rsp,%r8), %r8 ; FALLBACK15-NEXT: shldq %cl, %r14, %r8 ; FALLBACK15-NEXT: shlxq %rcx, %r11, %r9 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK15-NEXT: shldq %cl, %r11, %rbx ; FALLBACK15-NEXT: movq %r8, 56(%rdx) ; FALLBACK15-NEXT: movq %r15, 48(%rdx) ; FALLBACK15-NEXT: movq %rbx, 8(%rdx) ; FALLBACK15-NEXT: movq %r10, 16(%rdx) ; FALLBACK15-NEXT: movq %rdi, 24(%rdx) ; FALLBACK15-NEXT: movq %rax, 32(%rdx) ; FALLBACK15-NEXT: movq %rsi, 40(%rdx) ; FALLBACK15-NEXT: movq %r9, (%rdx) ; FALLBACK15-NEXT: popq %rbx ; FALLBACK15-NEXT: popq %r14 ; FALLBACK15-NEXT: popq %r15 ; FALLBACK15-NEXT: vzeroupper ; FALLBACK15-NEXT: retq ; ; FALLBACK16-LABEL: shl_64bytes: ; FALLBACK16: # %bb.0: ; FALLBACK16-NEXT: pushl %ebp ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi ; FALLBACK16-NEXT: subl $204, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl (%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 4(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 8(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 12(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 16(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
FALLBACK16-NEXT: movl 20(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 24(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 28(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 32(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 36(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 40(%eax), %ebp ; FALLBACK16-NEXT: movl 44(%eax), %ebx ; FALLBACK16-NEXT: movl 48(%eax), %edi ; FALLBACK16-NEXT: movl 52(%eax), %esi ; FALLBACK16-NEXT: movl 56(%eax), %edx ; FALLBACK16-NEXT: movl 60(%eax), %ecx ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl (%eax), %eax ; FALLBACK16-NEXT: xorps %xmm0, %xmm0 ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %eax, %edx ; FALLBACK16-NEXT: andl $60, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %ecx ; FALLBACK16-NEXT: subl %edx, %ecx ; FALLBACK16-NEXT: movl (%ecx), %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 4(%ecx), %edx ; FALLBACK16-NEXT: movl %ecx, %ebp ; FALLBACK16-NEXT: shll $3, %eax ; FALLBACK16-NEXT: andl $24, %eax ; FALLBACK16-NEXT: movl %edx, %esi ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: shrl %edi ; FALLBACK16-NEXT: movb %al, %ch ; FALLBACK16-NEXT: notb %ch ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: orl %esi, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 12(%ebp), %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: movl 8(%ebp), %esi ; FALLBACK16-NEXT: movl %ebp, %edi ; FALLBACK16-NEXT: movl %esi, %ebp ; FALLBACK16-NEXT: shrl %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %ebp ; FALLBACK16-NEXT: orl %ebx, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: shrl %edx ; FALLBACK16-NEXT: 
movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: orl %esi, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl %edi, %ebp ; FALLBACK16-NEXT: movl 20(%edi), %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: movl 16(%edi), %esi ; FALLBACK16-NEXT: movl %esi, %edx ; FALLBACK16-NEXT: shrl %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: orl %ebx, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK16-NEXT: shrl %edi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: orl %esi, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl %ebp, %edx ; FALLBACK16-NEXT: movl 28(%ebp), %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: movl 24(%ebp), %esi ; FALLBACK16-NEXT: movl %esi, %edi ; FALLBACK16-NEXT: shrl %edi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: orl %ebx, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK16-NEXT: shrl %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %ebp ; FALLBACK16-NEXT: orl %esi, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 36(%edx), %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: movl 
32(%edx), %esi ; FALLBACK16-NEXT: movl %edx, %ebp ; FALLBACK16-NEXT: movl %esi, %edi ; FALLBACK16-NEXT: shrl %edi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: orl %ebx, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: shrl %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: orl %esi, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 44(%ebp), %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: movl 40(%ebp), %esi ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl %esi, %edx ; FALLBACK16-NEXT: shrl %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: orl %ebx, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: shrl %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: orl %esi, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 52(%ebp), %esi ; FALLBACK16-NEXT: movl %esi, %edi ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: negl %edx ; FALLBACK16-NEXT: movl 176(%esp,%edx), %ebx ; FALLBACK16-NEXT: movl %ebx, %ebp ; FALLBACK16-NEXT: shrl %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %ebp ; FALLBACK16-NEXT: orl %edi, %ebp ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %ebx 
; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: shrl %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: orl %ebx, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK16-NEXT: movl 60(%edi), %edx ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: movl 56(%edi), %ebx ; FALLBACK16-NEXT: movl %ebx, %edi ; FALLBACK16-NEXT: shrl %edi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: orl %edx, %edi ; FALLBACK16-NEXT: movb %al, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: shrl %esi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: orl %ebx, %esi ; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl %edx, (%eax) ; FALLBACK16-NEXT: movl %esi, 56(%eax) ; FALLBACK16-NEXT: movl %edi, 60(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 48(%eax) ; FALLBACK16-NEXT: movl %ebp, 52(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 40(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 44(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 32(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 36(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 24(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 28(%eax) ; FALLBACK16-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 16(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 20(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 8(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 12(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 4(%eax) ; FALLBACK16-NEXT: addl $204, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx ; FALLBACK16-NEXT: popl %ebp ; FALLBACK16-NEXT: retl ; ; FALLBACK17-LABEL: shl_64bytes: ; FALLBACK17: # %bb.0: ; FALLBACK17-NEXT: pushl %ebp ; FALLBACK17-NEXT: pushl %ebx ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi ; FALLBACK17-NEXT: subl $188, %esp ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl (%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 4(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 8(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 12(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 16(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 20(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 24(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 28(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 32(%ecx), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 36(%ecx), %eax ; FALLBACK17-NEXT: movl 
%eax, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: movl 40(%ecx), %ebp ; FALLBACK17-NEXT: movl 44(%ecx), %ebx ; FALLBACK17-NEXT: movl 48(%ecx), %edi ; FALLBACK17-NEXT: movl 52(%ecx), %esi ; FALLBACK17-NEXT: movl 56(%ecx), %edx ; FALLBACK17-NEXT: movl 60(%ecx), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl (%ecx), %ecx ; FALLBACK17-NEXT: xorps %xmm0, %xmm0 ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: 
movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ecx, %ebp ; FALLBACK17-NEXT: andl $60, %ebp ; FALLBACK17-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: subl %ebp, %eax ; FALLBACK17-NEXT: movl 8(%eax), %esi ; FALLBACK17-NEXT: movl 12(%eax), %edx ; FALLBACK17-NEXT: shll $3, %ecx ; FALLBACK17-NEXT: andl $24, %ecx ; FALLBACK17-NEXT: movl %edx, %edi ; FALLBACK17-NEXT: shldl %cl, %esi, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 4(%eax), %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %edi, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 16(%eax), %edi ; FALLBACK17-NEXT: movl 20(%eax), %esi ; FALLBACK17-NEXT: movl %esi, %ebx ; FALLBACK17-NEXT: shldl %cl, %edi, %ebx ; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %edx, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 24(%eax), %edi ; FALLBACK17-NEXT: movl 28(%eax), %edx ; FALLBACK17-NEXT: movl %edx, %ebx ; FALLBACK17-NEXT: shldl %cl, %edi, %ebx ; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %esi, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 32(%eax), %edi ; FALLBACK17-NEXT: movl 36(%eax), %esi ; FALLBACK17-NEXT: movl %esi, %ebx ; FALLBACK17-NEXT: shldl %cl, %edi, %ebx ; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %edx, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 40(%eax), %edx ; FALLBACK17-NEXT: movl 44(%eax), %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %edx, %edi ; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: shldl %cl, %esi, %edx ; FALLBACK17-NEXT: movl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 56(%eax), %edx ; FALLBACK17-NEXT: movl 60(%eax), %edi ; FALLBACK17-NEXT: shldl %cl, %edx, %edi ; FALLBACK17-NEXT: movl (%eax), %ebx ; FALLBACK17-NEXT: movl 52(%eax), %esi ; FALLBACK17-NEXT: shldl %cl, %esi, %edx ; FALLBACK17-NEXT: negl %ebp ; FALLBACK17-NEXT: movl 160(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK17-NEXT: movl %edx, 56(%ebp) ; FALLBACK17-NEXT: movl %edi, 60(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: shldl %cl, %ebx, %edx ; FALLBACK17-NEXT: shll %cl, %ebx ; FALLBACK17-NEXT: shldl %cl, %eax, %esi ; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK17-NEXT: shldl %cl, %edi, %eax ; FALLBACK17-NEXT: movl %eax, 48(%ebp) ; FALLBACK17-NEXT: movl %esi, 52(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 40(%ebp) ; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 44(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 32(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 36(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 24(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 28(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 16(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 20(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 8(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 12(%ebp) ; 
FALLBACK17-NEXT: movl %ebx, (%ebp) ; FALLBACK17-NEXT: movl %edx, 4(%ebp) ; FALLBACK17-NEXT: addl $188, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi ; FALLBACK17-NEXT: popl %ebx ; FALLBACK17-NEXT: popl %ebp ; FALLBACK17-NEXT: retl ; ; FALLBACK18-LABEL: shl_64bytes: ; FALLBACK18: # %bb.0: ; FALLBACK18-NEXT: pushl %ebp ; FALLBACK18-NEXT: pushl %ebx ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $204, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl (%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 4(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 8(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 12(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 16(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 20(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 24(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 28(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 32(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 36(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 40(%eax), %ebx ; FALLBACK18-NEXT: movl 44(%eax), %edi ; FALLBACK18-NEXT: movl 48(%eax), %esi ; FALLBACK18-NEXT: movl 52(%eax), %edx ; FALLBACK18-NEXT: movl 56(%eax), %ecx ; FALLBACK18-NEXT: movl 60(%eax), %eax ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK18-NEXT: movl (%ebp), %ebp ; FALLBACK18-NEXT: xorps %xmm0, %xmm0 ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) 
; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: leal (,%ebp,8), %edx ; FALLBACK18-NEXT: andl $24, %edx ; FALLBACK18-NEXT: andl $60, %ebp ; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi ; FALLBACK18-NEXT: subl %ebp, %edi ; FALLBACK18-NEXT: movl (%edi), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: 
movl 4(%edi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl %edx, %ebx ; FALLBACK18-NEXT: notb %bl ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi ; FALLBACK18-NEXT: shlxl %edx, %eax, %ecx ; FALLBACK18-NEXT: orl %ecx, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 8(%edi), %esi ; FALLBACK18-NEXT: movl %esi, %ecx ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK18-NEXT: movl 12(%edi), %ecx ; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shlxl %edx, %esi, %esi ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: shrl %eax ; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 16(%edi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %eax ; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK18-NEXT: movl 20(%edi), %esi ; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK18-NEXT: orl %eax, %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 24(%edi), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK18-NEXT: movl 28(%edi), %ecx ; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: 
shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %esi ; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK18-NEXT: orl %eax, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 32(%edi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %eax ; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK18-NEXT: movl 36(%edi), %esi ; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK18-NEXT: orl %eax, %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 40(%edi), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK18-NEXT: movl 44(%edi), %ecx ; FALLBACK18-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %esi ; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK18-NEXT: orl %eax, %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 48(%edi), %esi ; FALLBACK18-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrl %esi ; FALLBACK18-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK18-NEXT: movl 52(%edi), %esi ; FALLBACK18-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl 
%ebx, %ecx, %ebp ; FALLBACK18-NEXT: orl %eax, %ebp ; FALLBACK18-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: negl %eax ; FALLBACK18-NEXT: shlxl %edx, 188(%esp,%eax), %ecx ; FALLBACK18-NEXT: movl 56(%edi), %eax ; FALLBACK18-NEXT: shlxl %edx, %eax, %edx ; FALLBACK18-NEXT: shrl %esi ; FALLBACK18-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK18-NEXT: orl %edx, %esi ; FALLBACK18-NEXT: shrl %eax ; FALLBACK18-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK18-NEXT: orl %eax, %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, (%eax) ; FALLBACK18-NEXT: movl %esi, 56(%eax) ; FALLBACK18-NEXT: movl %ecx, 60(%eax) ; FALLBACK18-NEXT: movl %ebp, 48(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 52(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 32(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 36(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 24(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 28(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 16(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 20(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; 
FALLBACK18-NEXT: movl %ecx, 8(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 12(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 4(%eax) ; FALLBACK18-NEXT: addl $204, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi ; FALLBACK18-NEXT: popl %ebx ; FALLBACK18-NEXT: popl %ebp ; FALLBACK18-NEXT: retl ; ; FALLBACK19-LABEL: shl_64bytes: ; FALLBACK19: # %bb.0: ; FALLBACK19-NEXT: pushl %ebp ; FALLBACK19-NEXT: pushl %ebx ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $204, %esp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK19-NEXT: movl (%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 4(%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 8(%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 12(%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 16(%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 20(%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 24(%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 28(%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 32(%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 36(%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 40(%ebp), %ebx ; FALLBACK19-NEXT: movl 44(%ebp), %edi ; FALLBACK19-NEXT: movl 48(%ebp), %esi ; FALLBACK19-NEXT: movl 52(%ebp), %edx ; FALLBACK19-NEXT: movl 56(%ebp), %ecx ; FALLBACK19-NEXT: movl 
60(%ebp), %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK19-NEXT: movl (%ebp), %ebp ; FALLBACK19-NEXT: xorps %xmm0, %xmm0 ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: leal (,%ebp,8), %ecx ; FALLBACK19-NEXT: andl $24, %ecx ; FALLBACK19-NEXT: andl $60, %ebp ; FALLBACK19-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: subl %ebp, %eax ; 
FALLBACK19-NEXT: movl 4(%eax), %esi ; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 8(%eax), %edi ; FALLBACK19-NEXT: movl 12(%eax), %edx ; FALLBACK19-NEXT: movl %edx, %ebx ; FALLBACK19-NEXT: shldl %cl, %edi, %ebx ; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shldl %cl, %esi, %edi ; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 16(%eax), %edi ; FALLBACK19-NEXT: movl 20(%eax), %esi ; FALLBACK19-NEXT: movl %esi, %ebx ; FALLBACK19-NEXT: shldl %cl, %edi, %ebx ; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shldl %cl, %edx, %edi ; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 24(%eax), %edi ; FALLBACK19-NEXT: movl 28(%eax), %edx ; FALLBACK19-NEXT: movl %edx, %ebx ; FALLBACK19-NEXT: shldl %cl, %edi, %ebx ; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shldl %cl, %esi, %edi ; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 32(%eax), %edi ; FALLBACK19-NEXT: movl 36(%eax), %esi ; FALLBACK19-NEXT: movl %esi, %ebx ; FALLBACK19-NEXT: shldl %cl, %edi, %ebx ; FALLBACK19-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shldl %cl, %edx, %edi ; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 40(%eax), %ebx ; FALLBACK19-NEXT: movl 44(%eax), %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shldl %cl, %ebx, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shldl %cl, %esi, %ebx ; FALLBACK19-NEXT: movl 56(%eax), %edx ; FALLBACK19-NEXT: movl 60(%eax), %edi ; FALLBACK19-NEXT: shldl %cl, %edx, %edi ; FALLBACK19-NEXT: movl (%eax), %esi ; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 
52(%eax), %esi ; FALLBACK19-NEXT: shldl %cl, %esi, %edx ; FALLBACK19-NEXT: negl %ebp ; FALLBACK19-NEXT: movl 176(%esp,%ebp), %ebp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: movl %edx, 56(%eax) ; FALLBACK19-NEXT: movl %edi, 60(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: shlxl %ecx, %edx, %edi ; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK19-NEXT: shldl %cl, %edx, %edi ; FALLBACK19-NEXT: shldl %cl, %ebp, %esi ; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: shldl %cl, %edx, %ebp ; FALLBACK19-NEXT: movl %ebp, 48(%eax) ; FALLBACK19-NEXT: movl %esi, 52(%eax) ; FALLBACK19-NEXT: movl %ebx, 40(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 44(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 32(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 36(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 24(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 28(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 16(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 20(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 8(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 12(%eax) ; FALLBACK19-NEXT: movl %edi, 4(%eax) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl 
%ecx, (%eax) ; FALLBACK19-NEXT: addl $204, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi ; FALLBACK19-NEXT: popl %ebx ; FALLBACK19-NEXT: popl %ebp ; FALLBACK19-NEXT: retl ; ; FALLBACK20-LABEL: shl_64bytes: ; FALLBACK20: # %bb.0: ; FALLBACK20-NEXT: pushl %ebp ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; FALLBACK20-NEXT: pushl %esi ; FALLBACK20-NEXT: subl $204, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 ; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK20-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK20-NEXT: movups 48(%ecx), %xmm3 ; FALLBACK20-NEXT: movl (%eax), %eax ; FALLBACK20-NEXT: xorps %xmm4, %xmm4 ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %eax, %edx ; FALLBACK20-NEXT: andl $60, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: subl %edx, %ecx ; FALLBACK20-NEXT: movl (%ecx), %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 4(%ecx), %edx ; FALLBACK20-NEXT: movl %ecx, %ebp ; FALLBACK20-NEXT: shll $3, %eax ; FALLBACK20-NEXT: andl $24, %eax ; FALLBACK20-NEXT: movl %edx, %esi ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: shrl %edi ; FALLBACK20-NEXT: movb %al, %ch ; FALLBACK20-NEXT: notb %ch ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: orl %esi, %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 
12(%ebp), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl 8(%ebp), %esi ; FALLBACK20-NEXT: movl %ebp, %edi ; FALLBACK20-NEXT: movl %esi, %ebp ; FALLBACK20-NEXT: shrl %ebp ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: orl %ebx, %ebp ; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: shrl %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edx ; FALLBACK20-NEXT: orl %esi, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl %edi, %ebp ; FALLBACK20-NEXT: movl 20(%edi), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl 16(%edi), %esi ; FALLBACK20-NEXT: movl %esi, %edx ; FALLBACK20-NEXT: shrl %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edx ; FALLBACK20-NEXT: orl %ebx, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK20-NEXT: shrl %edi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: orl %esi, %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl %ebp, %edx ; FALLBACK20-NEXT: movl 28(%ebp), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl 24(%ebp), %esi ; FALLBACK20-NEXT: movl %esi, %edi ; FALLBACK20-NEXT: shrl %edi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: orl %ebx, %edi ; FALLBACK20-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK20-NEXT: shrl %ebp ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: orl %esi, %ebp ; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 36(%edx), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl 32(%edx), %esi ; FALLBACK20-NEXT: movl %edx, %ebp ; FALLBACK20-NEXT: movl %esi, %edi ; FALLBACK20-NEXT: shrl %edi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: orl %ebx, %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: shrl %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edx ; FALLBACK20-NEXT: orl %esi, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 44(%ebp), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl 40(%ebp), %esi ; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl %esi, %edx ; FALLBACK20-NEXT: shrl %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edx ; FALLBACK20-NEXT: orl %ebx, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: shrl %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edx ; FALLBACK20-NEXT: orl %esi, %edx ; 
FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 52(%ebp), %esi ; FALLBACK20-NEXT: movl %esi, %edi ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %edi ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: negl %edx ; FALLBACK20-NEXT: movl 176(%esp,%edx), %ebx ; FALLBACK20-NEXT: movl %ebx, %ebp ; FALLBACK20-NEXT: shrl %ebp ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: orl %edi, %ebp ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: shrl %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edx ; FALLBACK20-NEXT: orl %ebx, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK20-NEXT: movl 60(%edi), %edx ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: movl 56(%edi), %ebx ; FALLBACK20-NEXT: movl %ebx, %edi ; FALLBACK20-NEXT: shrl %edi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: orl %edx, %edi ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: shrl %esi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shrl %cl, %esi ; FALLBACK20-NEXT: orl %ebx, %esi ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl %edx, (%eax) ; FALLBACK20-NEXT: movl %esi, 56(%eax) ; FALLBACK20-NEXT: movl %edi, 60(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 48(%eax) ; FALLBACK20-NEXT: movl %ebp, 52(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 40(%eax) ; 
FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 44(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 32(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 36(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 24(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 28(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 16(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 20(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 8(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 12(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 4(%eax) ; FALLBACK20-NEXT: addl $204, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx ; FALLBACK20-NEXT: popl %ebp ; FALLBACK20-NEXT: retl ; ; FALLBACK21-LABEL: shl_64bytes: ; FALLBACK21: # %bb.0: ; FALLBACK21-NEXT: pushl %ebp ; FALLBACK21-NEXT: pushl %ebx ; FALLBACK21-NEXT: pushl %edi ; FALLBACK21-NEXT: pushl %esi ; FALLBACK21-NEXT: subl $188, %esp ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK21-NEXT: movups (%ecx), %xmm0 ; FALLBACK21-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK21-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK21-NEXT: movups 48(%ecx), %xmm3 ; FALLBACK21-NEXT: movl (%eax), %ecx ; FALLBACK21-NEXT: xorps %xmm4, %xmm4 ; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps 
%xmm4, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %ecx, %ebp ; FALLBACK21-NEXT: andl $60, %ebp ; FALLBACK21-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK21-NEXT: subl %ebp, %eax ; FALLBACK21-NEXT: movl 8(%eax), %esi ; FALLBACK21-NEXT: movl 12(%eax), %edx ; FALLBACK21-NEXT: shll $3, %ecx ; FALLBACK21-NEXT: andl $24, %ecx ; FALLBACK21-NEXT: movl %edx, %edi ; FALLBACK21-NEXT: shldl %cl, %esi, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 4(%eax), %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %edi, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 16(%eax), %edi ; FALLBACK21-NEXT: movl 20(%eax), %esi ; FALLBACK21-NEXT: movl %esi, %ebx ; FALLBACK21-NEXT: shldl %cl, %edi, %ebx ; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %edx, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 24(%eax), %edi ; FALLBACK21-NEXT: movl 28(%eax), %edx ; FALLBACK21-NEXT: movl %edx, %ebx ; FALLBACK21-NEXT: shldl %cl, %edi, %ebx ; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %esi, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 32(%eax), %edi ; FALLBACK21-NEXT: movl 36(%eax), %esi ; FALLBACK21-NEXT: movl %esi, %ebx ; FALLBACK21-NEXT: shldl %cl, %edi, %ebx ; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %edx, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 40(%eax), %edx ; FALLBACK21-NEXT: movl 44(%eax), %edi ; FALLBACK21-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %edx, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shldl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK21-NEXT: movl 56(%eax), %edx ; FALLBACK21-NEXT: movl 60(%eax), %edi ; FALLBACK21-NEXT: shldl %cl, %edx, %edi ; FALLBACK21-NEXT: movl (%eax), %ebx ; FALLBACK21-NEXT: movl 52(%eax), %esi ; FALLBACK21-NEXT: shldl %cl, %esi, %edx ; FALLBACK21-NEXT: negl %ebp ; FALLBACK21-NEXT: movl 160(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK21-NEXT: movl %edx, 56(%ebp) ; FALLBACK21-NEXT: movl %edi, 60(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK21-NEXT: shldl %cl, %ebx, %edx ; FALLBACK21-NEXT: shll %cl, %ebx ; FALLBACK21-NEXT: shldl %cl, %eax, %esi ; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK21-NEXT: shldl %cl, %edi, %eax ; FALLBACK21-NEXT: movl %eax, 48(%ebp) ; FALLBACK21-NEXT: movl %esi, 52(%ebp) ; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 40(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 44(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 32(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 36(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 24(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 28(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 16(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 20(%ebp) ; FALLBACK21-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 8(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 12(%ebp) ; FALLBACK21-NEXT: movl %ebx, (%ebp) ; FALLBACK21-NEXT: movl %edx, 4(%ebp) ; FALLBACK21-NEXT: addl $188, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi ; FALLBACK21-NEXT: popl %ebx ; FALLBACK21-NEXT: popl %ebp ; FALLBACK21-NEXT: retl ; ; FALLBACK22-LABEL: shl_64bytes: ; FALLBACK22: # %bb.0: ; FALLBACK22-NEXT: pushl %ebp ; FALLBACK22-NEXT: pushl %ebx ; FALLBACK22-NEXT: pushl %edi ; FALLBACK22-NEXT: pushl %esi ; FALLBACK22-NEXT: subl $204, %esp ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 ; FALLBACK22-NEXT: movl (%eax), %eax ; FALLBACK22-NEXT: xorps %xmm4, %xmm4 ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: leal (,%eax,8), %edx ; FALLBACK22-NEXT: andl $24, %edx ; FALLBACK22-NEXT: andl $60, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi ; FALLBACK22-NEXT: subl %eax, %edi ; FALLBACK22-NEXT: movl (%edi), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 4(%edi), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: notb %bl ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, 
%esi ; FALLBACK22-NEXT: shlxl %edx, %eax, %ecx ; FALLBACK22-NEXT: orl %ecx, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 8(%edi), %esi ; FALLBACK22-NEXT: movl %esi, %ecx ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK22-NEXT: movl 12(%edi), %ecx ; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shlxl %edx, %esi, %esi ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: shrl %eax ; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 16(%edi), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %eax ; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK22-NEXT: movl 20(%edi), %esi ; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: orl %eax, %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 24(%edi), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK22-NEXT: movl 28(%edi), %ecx ; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %esi ; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: orl %eax, %esi ; FALLBACK22-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 32(%edi), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %eax ; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK22-NEXT: movl 36(%edi), %esi ; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: orl %eax, %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 40(%edi), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK22-NEXT: movl 44(%edi), %ecx ; FALLBACK22-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %esi ; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: orl %eax, %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 48(%edi), %esi ; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrl %esi ; FALLBACK22-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK22-NEXT: movl 52(%edi), %esi ; FALLBACK22-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %ebp ; FALLBACK22-NEXT: orl %eax, %ebp ; FALLBACK22-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK22-NEXT: negl %eax ; FALLBACK22-NEXT: shlxl %edx, 188(%esp,%eax), %ecx ; FALLBACK22-NEXT: movl 56(%edi), %eax ; FALLBACK22-NEXT: shlxl %edx, %eax, %edx ; FALLBACK22-NEXT: shrl %esi ; FALLBACK22-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK22-NEXT: orl %edx, %esi ; FALLBACK22-NEXT: shrl %eax ; FALLBACK22-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK22-NEXT: orl %eax, %ecx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK22-NEXT: movl %edx, (%eax) ; FALLBACK22-NEXT: movl %esi, 56(%eax) ; FALLBACK22-NEXT: movl %ecx, 60(%eax) ; FALLBACK22-NEXT: movl %ebp, 48(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 52(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 32(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 36(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 24(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 28(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 16(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 20(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 8(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 12(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; 
FALLBACK22-NEXT: movl %ecx, 4(%eax) ; FALLBACK22-NEXT: addl $204, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx ; FALLBACK22-NEXT: popl %ebp ; FALLBACK22-NEXT: retl ; ; FALLBACK23-LABEL: shl_64bytes: ; FALLBACK23: # %bb.0: ; FALLBACK23-NEXT: pushl %ebp ; FALLBACK23-NEXT: pushl %ebx ; FALLBACK23-NEXT: pushl %edi ; FALLBACK23-NEXT: pushl %esi ; FALLBACK23-NEXT: subl $204, %esp ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK23-NEXT: movups (%ecx), %xmm0 ; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK23-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK23-NEXT: movups 48(%ecx), %xmm3 ; FALLBACK23-NEXT: movl (%eax), %ebp ; FALLBACK23-NEXT: xorps %xmm4, %xmm4 ; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: leal (,%ebp,8), %ecx ; FALLBACK23-NEXT: andl $24, %ecx ; FALLBACK23-NEXT: andl $60, %ebp ; FALLBACK23-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: subl %ebp, %eax ; FALLBACK23-NEXT: movl 4(%eax), %esi ; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 8(%eax), %edi ; FALLBACK23-NEXT: movl 12(%eax), %edx ; FALLBACK23-NEXT: movl %edx, %ebx ; FALLBACK23-NEXT: shldl %cl, %edi, %ebx ; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %esi, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 16(%eax), %edi ; FALLBACK23-NEXT: movl 20(%eax), %esi ; FALLBACK23-NEXT: movl %esi, %ebx ; FALLBACK23-NEXT: shldl %cl, %edi, %ebx ; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) 
# 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %edx, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 24(%eax), %edi ; FALLBACK23-NEXT: movl 28(%eax), %edx ; FALLBACK23-NEXT: movl %edx, %ebx ; FALLBACK23-NEXT: shldl %cl, %edi, %ebx ; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %esi, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 32(%eax), %edi ; FALLBACK23-NEXT: movl 36(%eax), %esi ; FALLBACK23-NEXT: movl %esi, %ebx ; FALLBACK23-NEXT: shldl %cl, %edi, %ebx ; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %edx, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 40(%eax), %ebx ; FALLBACK23-NEXT: movl 44(%eax), %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %ebx, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %esi, %ebx ; FALLBACK23-NEXT: movl 56(%eax), %edx ; FALLBACK23-NEXT: movl 60(%eax), %edi ; FALLBACK23-NEXT: shldl %cl, %edx, %edi ; FALLBACK23-NEXT: movl (%eax), %esi ; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 52(%eax), %esi ; FALLBACK23-NEXT: shldl %cl, %esi, %edx ; FALLBACK23-NEXT: negl %ebp ; FALLBACK23-NEXT: movl 176(%esp,%ebp), %ebp ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl %edx, 56(%eax) ; FALLBACK23-NEXT: movl %edi, 60(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK23-NEXT: shlxl %ecx, %edx, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK23-NEXT: shldl %cl, %edx, %edi ; FALLBACK23-NEXT: shldl %cl, %ebp, %esi ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx ; 
FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK23-NEXT: shldl %cl, %edx, %ebp ; FALLBACK23-NEXT: movl %ebp, 48(%eax) ; FALLBACK23-NEXT: movl %esi, 52(%eax) ; FALLBACK23-NEXT: movl %ebx, 40(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 44(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 32(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 36(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 24(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 28(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 16(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 20(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 8(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 12(%eax) ; FALLBACK23-NEXT: movl %edi, 4(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, (%eax) ; FALLBACK23-NEXT: addl $204, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi ; FALLBACK23-NEXT: popl %ebx ; FALLBACK23-NEXT: popl %ebp ; FALLBACK23-NEXT: retl ; ; FALLBACK24-LABEL: shl_64bytes: ; FALLBACK24: # %bb.0: ; FALLBACK24-NEXT: pushl %ebp ; FALLBACK24-NEXT: pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi ; FALLBACK24-NEXT: subl $204, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1 ; FALLBACK24-NEXT: movl (%eax), %eax ; FALLBACK24-NEXT: vxorps %xmm2, 
%xmm2, %xmm2 ; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %eax, %edx ; FALLBACK24-NEXT: andl $60, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: subl %edx, %ecx ; FALLBACK24-NEXT: movl (%ecx), %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 4(%ecx), %edx ; FALLBACK24-NEXT: movl %ecx, %ebp ; FALLBACK24-NEXT: shll $3, %eax ; FALLBACK24-NEXT: andl $24, %eax ; FALLBACK24-NEXT: movl %edx, %esi ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: shrl %edi ; FALLBACK24-NEXT: movb %al, %ch ; FALLBACK24-NEXT: notb %ch ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: orl %esi, %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 12(%ebp), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: movl 8(%ebp), %esi ; FALLBACK24-NEXT: movl %ebp, %edi ; FALLBACK24-NEXT: movl %esi, %ebp ; FALLBACK24-NEXT: shrl %ebp ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: orl %ebx, %ebp ; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: shrl %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edx ; FALLBACK24-NEXT: orl %esi, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl %edi, %ebp ; FALLBACK24-NEXT: movl 20(%edi), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: 
shll %cl, %ebx ; FALLBACK24-NEXT: movl 16(%edi), %esi ; FALLBACK24-NEXT: movl %esi, %edx ; FALLBACK24-NEXT: shrl %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edx ; FALLBACK24-NEXT: orl %ebx, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK24-NEXT: shrl %edi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: orl %esi, %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl %ebp, %edx ; FALLBACK24-NEXT: movl 28(%ebp), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: movl 24(%ebp), %esi ; FALLBACK24-NEXT: movl %esi, %edi ; FALLBACK24-NEXT: shrl %edi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: orl %ebx, %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK24-NEXT: shrl %ebp ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: orl %esi, %ebp ; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 36(%edx), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: movl 32(%edx), %esi ; FALLBACK24-NEXT: movl %edx, %ebp ; FALLBACK24-NEXT: movl %esi, %edi ; FALLBACK24-NEXT: shrl %edi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: orl %ebx, %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, 
%esi ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: shrl %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edx ; FALLBACK24-NEXT: orl %esi, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 44(%ebp), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: movl 40(%ebp), %esi ; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl %esi, %edx ; FALLBACK24-NEXT: shrl %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edx ; FALLBACK24-NEXT: orl %ebx, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: shrl %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edx ; FALLBACK24-NEXT: orl %esi, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 52(%ebp), %esi ; FALLBACK24-NEXT: movl %esi, %edi ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: negl %edx ; FALLBACK24-NEXT: movl 176(%esp,%edx), %ebx ; FALLBACK24-NEXT: movl %ebx, %ebp ; FALLBACK24-NEXT: shrl %ebp ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: orl %edi, %ebp ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: shrl %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edx ; FALLBACK24-NEXT: orl %ebx, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; 
FALLBACK24-NEXT: movl 60(%edi), %edx ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: movl 56(%edi), %ebx ; FALLBACK24-NEXT: movl %ebx, %edi ; FALLBACK24-NEXT: shrl %edi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: orl %edx, %edi ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: shrl %esi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shrl %cl, %esi ; FALLBACK24-NEXT: orl %ebx, %esi ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl %edx, (%eax) ; FALLBACK24-NEXT: movl %esi, 56(%eax) ; FALLBACK24-NEXT: movl %edi, 60(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 48(%eax) ; FALLBACK24-NEXT: movl %ebp, 52(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 40(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 44(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 32(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 36(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 24(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 28(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 16(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 20(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 8(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx 
# 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 12(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 4(%eax) ; FALLBACK24-NEXT: addl $204, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx ; FALLBACK24-NEXT: popl %ebp ; FALLBACK24-NEXT: vzeroupper ; FALLBACK24-NEXT: retl ; ; FALLBACK25-LABEL: shl_64bytes: ; FALLBACK25: # %bb.0: ; FALLBACK25-NEXT: pushl %ebp ; FALLBACK25-NEXT: pushl %ebx ; FALLBACK25-NEXT: pushl %edi ; FALLBACK25-NEXT: pushl %esi ; FALLBACK25-NEXT: subl $188, %esp ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK25-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK25-NEXT: vmovups 32(%ecx), %ymm1 ; FALLBACK25-NEXT: movl (%eax), %ecx ; FALLBACK25-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %ecx, %ebp ; FALLBACK25-NEXT: andl $60, %ebp ; FALLBACK25-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK25-NEXT: subl %ebp, %eax ; FALLBACK25-NEXT: movl 8(%eax), %esi ; FALLBACK25-NEXT: movl 12(%eax), %edx ; FALLBACK25-NEXT: shll $3, %ecx ; FALLBACK25-NEXT: andl $24, %ecx ; FALLBACK25-NEXT: movl %edx, %edi ; FALLBACK25-NEXT: shldl %cl, %esi, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 4(%eax), %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %edi, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 16(%eax), %edi ; FALLBACK25-NEXT: movl 20(%eax), %esi ; FALLBACK25-NEXT: movl %esi, %ebx ; FALLBACK25-NEXT: shldl %cl, %edi, %ebx ; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %edx, %edi ; FALLBACK25-NEXT: movl 
%edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 24(%eax), %edi ; FALLBACK25-NEXT: movl 28(%eax), %edx ; FALLBACK25-NEXT: movl %edx, %ebx ; FALLBACK25-NEXT: shldl %cl, %edi, %ebx ; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %esi, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 32(%eax), %edi ; FALLBACK25-NEXT: movl 36(%eax), %esi ; FALLBACK25-NEXT: movl %esi, %ebx ; FALLBACK25-NEXT: shldl %cl, %edi, %ebx ; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %edx, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 40(%eax), %edx ; FALLBACK25-NEXT: movl 44(%eax), %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %edx, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shldl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK25-NEXT: movl 56(%eax), %edx ; FALLBACK25-NEXT: movl 60(%eax), %edi ; FALLBACK25-NEXT: shldl %cl, %edx, %edi ; FALLBACK25-NEXT: movl (%eax), %ebx ; FALLBACK25-NEXT: movl 52(%eax), %esi ; FALLBACK25-NEXT: shldl %cl, %esi, %edx ; FALLBACK25-NEXT: negl %ebp ; FALLBACK25-NEXT: movl 160(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK25-NEXT: movl %edx, 56(%ebp) ; FALLBACK25-NEXT: movl %edi, 60(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK25-NEXT: shldl %cl, %ebx, %edx ; FALLBACK25-NEXT: shll %cl, %ebx ; FALLBACK25-NEXT: shldl %cl, %eax, %esi ; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK25-NEXT: shldl %cl, %edi, %eax ; FALLBACK25-NEXT: movl %eax, 48(%ebp) ; FALLBACK25-NEXT: movl %esi, 52(%ebp) ; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload ; 
FALLBACK25-NEXT: movl %eax, 40(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 44(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 32(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 36(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 24(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 28(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 16(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 20(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 8(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 12(%ebp) ; FALLBACK25-NEXT: movl %ebx, (%ebp) ; FALLBACK25-NEXT: movl %edx, 4(%ebp) ; FALLBACK25-NEXT: addl $188, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi ; FALLBACK25-NEXT: popl %ebx ; FALLBACK25-NEXT: popl %ebp ; FALLBACK25-NEXT: vzeroupper ; FALLBACK25-NEXT: retl ; ; FALLBACK26-LABEL: shl_64bytes: ; FALLBACK26: # %bb.0: ; FALLBACK26-NEXT: pushl %ebp ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi ; FALLBACK26-NEXT: subl $204, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK26-NEXT: vmovups 32(%ecx), %ymm1 ; FALLBACK26-NEXT: movl (%eax), %eax ; FALLBACK26-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; 
FALLBACK26-NEXT: leal (,%eax,8), %edx ; FALLBACK26-NEXT: andl $24, %edx ; FALLBACK26-NEXT: andl $60, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi ; FALLBACK26-NEXT: subl %eax, %edi ; FALLBACK26-NEXT: movl (%edi), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 4(%edi), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: notb %bl ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi ; FALLBACK26-NEXT: shlxl %edx, %eax, %ecx ; FALLBACK26-NEXT: orl %ecx, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 8(%edi), %esi ; FALLBACK26-NEXT: movl %esi, %ecx ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK26-NEXT: movl 12(%edi), %ecx ; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shlxl %edx, %esi, %esi ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: shrl %eax ; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 16(%edi), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %eax ; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK26-NEXT: movl 20(%edi), %esi ; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: orl %eax, %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
FALLBACK26-NEXT: movl 24(%edi), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK26-NEXT: movl 28(%edi), %ecx ; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %esi ; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: orl %eax, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 32(%edi), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %eax ; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK26-NEXT: movl 36(%edi), %esi ; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: orl %eax, %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 40(%edi), %ecx ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK26-NEXT: movl 44(%edi), %ecx ; FALLBACK26-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %esi ; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: orl %eax, %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 48(%edi), %esi ; FALLBACK26-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
FALLBACK26-NEXT: shrl %esi ; FALLBACK26-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK26-NEXT: movl 52(%edi), %esi ; FALLBACK26-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %ebp ; FALLBACK26-NEXT: orl %eax, %ebp ; FALLBACK26-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK26-NEXT: negl %eax ; FALLBACK26-NEXT: shlxl %edx, 188(%esp,%eax), %ecx ; FALLBACK26-NEXT: movl 56(%edi), %eax ; FALLBACK26-NEXT: shlxl %edx, %eax, %edx ; FALLBACK26-NEXT: shrl %esi ; FALLBACK26-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK26-NEXT: orl %edx, %esi ; FALLBACK26-NEXT: shrl %eax ; FALLBACK26-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK26-NEXT: orl %eax, %ecx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK26-NEXT: movl %edx, (%eax) ; FALLBACK26-NEXT: movl %esi, 56(%eax) ; FALLBACK26-NEXT: movl %ecx, 60(%eax) ; FALLBACK26-NEXT: movl %ebp, 48(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 52(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 40(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 44(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 32(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 36(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 24(%eax) ; FALLBACK26-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 28(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 16(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 20(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 8(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 12(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 4(%eax) ; FALLBACK26-NEXT: addl $204, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx ; FALLBACK26-NEXT: popl %ebp ; FALLBACK26-NEXT: vzeroupper ; FALLBACK26-NEXT: retl ; ; FALLBACK27-LABEL: shl_64bytes: ; FALLBACK27: # %bb.0: ; FALLBACK27-NEXT: pushl %ebp ; FALLBACK27-NEXT: pushl %ebx ; FALLBACK27-NEXT: pushl %edi ; FALLBACK27-NEXT: pushl %esi ; FALLBACK27-NEXT: subl $204, %esp ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK27-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK27-NEXT: vmovups 32(%ecx), %ymm1 ; FALLBACK27-NEXT: movl (%eax), %ebx ; FALLBACK27-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: leal (,%ebx,8), %ecx ; FALLBACK27-NEXT: andl $24, %ecx ; FALLBACK27-NEXT: andl $60, %ebx ; FALLBACK27-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: subl %ebx, %eax ; FALLBACK27-NEXT: movl 4(%eax), %esi ; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 8(%eax), %edi ; FALLBACK27-NEXT: movl 12(%eax), %edx ; FALLBACK27-NEXT: movl %edx, %ebp ; FALLBACK27-NEXT: shldl %cl, %edi, %ebp ; FALLBACK27-NEXT: movl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shldl %cl, %esi, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 16(%eax), %edi ; FALLBACK27-NEXT: movl 20(%eax), %esi ; FALLBACK27-NEXT: movl %esi, %ebp ; FALLBACK27-NEXT: shldl %cl, %edi, %ebp ; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shldl %cl, %edx, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 24(%eax), %edi ; FALLBACK27-NEXT: movl 28(%eax), %edx ; FALLBACK27-NEXT: movl %edx, %ebp ; FALLBACK27-NEXT: shldl %cl, %edi, %ebp ; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shldl %cl, %esi, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 32(%eax), %edi ; FALLBACK27-NEXT: movl 36(%eax), %esi ; FALLBACK27-NEXT: movl %esi, %ebp ; FALLBACK27-NEXT: shldl %cl, %edi, %ebp ; FALLBACK27-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shldl %cl, %edx, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 40(%eax), %ebp ; FALLBACK27-NEXT: movl 44(%eax), %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shldl %cl, %ebp, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shldl %cl, %esi, %ebp ; FALLBACK27-NEXT: movl 56(%eax), %edx ; FALLBACK27-NEXT: movl 60(%eax), %edi ; FALLBACK27-NEXT: shldl %cl, %edx, %edi ; FALLBACK27-NEXT: movl (%eax), %esi ; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 52(%eax), %esi ; FALLBACK27-NEXT: shldl %cl, %esi, %edx ; FALLBACK27-NEXT: negl %ebx ; FALLBACK27-NEXT: movl 176(%esp,%ebx), %ebx ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: movl %edx, 56(%eax) ; FALLBACK27-NEXT: movl %edi, 60(%eax) ; FALLBACK27-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK27-NEXT: shlxl %ecx, %edx, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK27-NEXT: shldl %cl, %edx, %edi ; FALLBACK27-NEXT: shldl %cl, %ebx, %esi ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK27-NEXT: shldl %cl, %edx, %ebx ; FALLBACK27-NEXT: movl %ebx, 48(%eax) ; FALLBACK27-NEXT: movl %esi, 52(%eax) ; FALLBACK27-NEXT: movl %ebp, 40(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 44(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 32(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 36(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 24(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 28(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 16(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 20(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 8(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 12(%eax) ; FALLBACK27-NEXT: movl %edi, 4(%eax) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, (%eax) ; FALLBACK27-NEXT: addl $204, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi ; FALLBACK27-NEXT: popl %ebx ; FALLBACK27-NEXT: popl %ebp ; FALLBACK27-NEXT: vzeroupper ; FALLBACK27-NEXT: retl ; ; FALLBACK28-LABEL: shl_64bytes: ; FALLBACK28: # %bb.0: ; 
FALLBACK28-NEXT: pushl %ebp ; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi ; FALLBACK28-NEXT: subl $204, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %zmm0 ; FALLBACK28-NEXT: movl (%eax), %eax ; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %eax, %edx ; FALLBACK28-NEXT: andl $60, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: subl %edx, %ecx ; FALLBACK28-NEXT: movl (%ecx), %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 4(%ecx), %edx ; FALLBACK28-NEXT: movl %ecx, %ebp ; FALLBACK28-NEXT: shll $3, %eax ; FALLBACK28-NEXT: andl $24, %eax ; FALLBACK28-NEXT: movl %edx, %esi ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: shrl %edi ; FALLBACK28-NEXT: movb %al, %ch ; FALLBACK28-NEXT: notb %ch ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: orl %esi, %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 12(%ebp), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl 8(%ebp), %esi ; FALLBACK28-NEXT: movl %ebp, %edi ; FALLBACK28-NEXT: movl %esi, %ebp ; FALLBACK28-NEXT: shrl %ebp ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: orl %ebx, %ebp ; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: shrl %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edx ; FALLBACK28-NEXT: orl %esi, %edx ; 
FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl %edi, %ebp ; FALLBACK28-NEXT: movl 20(%edi), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl 16(%edi), %esi ; FALLBACK28-NEXT: movl %esi, %edx ; FALLBACK28-NEXT: shrl %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edx ; FALLBACK28-NEXT: orl %ebx, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK28-NEXT: shrl %edi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: orl %esi, %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl %ebp, %edx ; FALLBACK28-NEXT: movl 28(%ebp), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl 24(%ebp), %esi ; FALLBACK28-NEXT: movl %esi, %edi ; FALLBACK28-NEXT: shrl %edi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: orl %ebx, %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK28-NEXT: shrl %ebp ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: orl %esi, %ebp ; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 36(%edx), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl 32(%edx), %esi ; FALLBACK28-NEXT: movl %edx, %ebp ; FALLBACK28-NEXT: movl %esi, %edi 
; FALLBACK28-NEXT: shrl %edi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: orl %ebx, %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK28-NEXT: shrl %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edx ; FALLBACK28-NEXT: orl %esi, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 44(%ebp), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl 40(%ebp), %esi ; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl %esi, %edx ; FALLBACK28-NEXT: shrl %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edx ; FALLBACK28-NEXT: orl %ebx, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK28-NEXT: shrl %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edx ; FALLBACK28-NEXT: orl %esi, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 52(%ebp), %esi ; FALLBACK28-NEXT: movl %esi, %edi ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %edi ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK28-NEXT: negl %edx ; FALLBACK28-NEXT: movl 176(%esp,%edx), %ebx ; FALLBACK28-NEXT: movl %ebx, %ebp ; FALLBACK28-NEXT: shrl %ebp ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: orl %edi, %ebp ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; 
FALLBACK28-NEXT: shrl %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edx ; FALLBACK28-NEXT: orl %ebx, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK28-NEXT: movl 60(%edi), %edx ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: movl 56(%edi), %ebx ; FALLBACK28-NEXT: movl %ebx, %edi ; FALLBACK28-NEXT: shrl %edi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: orl %edx, %edi ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: shrl %esi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shrl %cl, %esi ; FALLBACK28-NEXT: orl %ebx, %esi ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl %edx, (%eax) ; FALLBACK28-NEXT: movl %esi, 56(%eax) ; FALLBACK28-NEXT: movl %edi, 60(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 48(%eax) ; FALLBACK28-NEXT: movl %ebp, 52(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 40(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 44(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 32(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 36(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 24(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 28(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 16(%eax) ; 
FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 20(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 8(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 12(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 4(%eax) ; FALLBACK28-NEXT: addl $204, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx ; FALLBACK28-NEXT: popl %ebp ; FALLBACK28-NEXT: vzeroupper ; FALLBACK28-NEXT: retl ; ; FALLBACK29-LABEL: shl_64bytes: ; FALLBACK29: # %bb.0: ; FALLBACK29-NEXT: pushl %ebp ; FALLBACK29-NEXT: pushl %ebx ; FALLBACK29-NEXT: pushl %edi ; FALLBACK29-NEXT: pushl %esi ; FALLBACK29-NEXT: subl $188, %esp ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK29-NEXT: vmovups (%ecx), %zmm0 ; FALLBACK29-NEXT: movl (%eax), %ecx ; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %ecx, %ebp ; FALLBACK29-NEXT: andl $60, %ebp ; FALLBACK29-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK29-NEXT: subl %ebp, %eax ; FALLBACK29-NEXT: movl 8(%eax), %esi ; FALLBACK29-NEXT: movl 12(%eax), %edx ; FALLBACK29-NEXT: shll $3, %ecx ; FALLBACK29-NEXT: andl $24, %ecx ; FALLBACK29-NEXT: movl %edx, %edi ; FALLBACK29-NEXT: shldl %cl, %esi, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 4(%eax), %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %edi, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 16(%eax), %edi ; FALLBACK29-NEXT: movl 20(%eax), %esi ; FALLBACK29-NEXT: movl %esi, %ebx ; FALLBACK29-NEXT: shldl %cl, %edi, %ebx ; 
FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %edx, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 24(%eax), %edi ; FALLBACK29-NEXT: movl 28(%eax), %edx ; FALLBACK29-NEXT: movl %edx, %ebx ; FALLBACK29-NEXT: shldl %cl, %edi, %ebx ; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %esi, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 32(%eax), %edi ; FALLBACK29-NEXT: movl 36(%eax), %esi ; FALLBACK29-NEXT: movl %esi, %ebx ; FALLBACK29-NEXT: shldl %cl, %edi, %ebx ; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %edx, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 40(%eax), %edx ; FALLBACK29-NEXT: movl 44(%eax), %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %edx, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shldl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK29-NEXT: movl 56(%eax), %edx ; FALLBACK29-NEXT: movl 60(%eax), %edi ; FALLBACK29-NEXT: shldl %cl, %edx, %edi ; FALLBACK29-NEXT: movl (%eax), %ebx ; FALLBACK29-NEXT: movl 52(%eax), %esi ; FALLBACK29-NEXT: shldl %cl, %esi, %edx ; FALLBACK29-NEXT: negl %ebp ; FALLBACK29-NEXT: movl 160(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK29-NEXT: movl %edx, 56(%ebp) ; FALLBACK29-NEXT: movl %edi, 60(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK29-NEXT: shldl %cl, %ebx, %edx ; FALLBACK29-NEXT: shll %cl, %ebx ; FALLBACK29-NEXT: shldl %cl, %eax, %esi ; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK29-NEXT: shldl %cl, %edi, %eax ; 
FALLBACK29-NEXT: movl %eax, 48(%ebp) ; FALLBACK29-NEXT: movl %esi, 52(%ebp) ; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 40(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 44(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 32(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 36(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 24(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 28(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 16(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 20(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 8(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 12(%ebp) ; FALLBACK29-NEXT: movl %ebx, (%ebp) ; FALLBACK29-NEXT: movl %edx, 4(%ebp) ; FALLBACK29-NEXT: addl $188, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi ; FALLBACK29-NEXT: popl %ebx ; FALLBACK29-NEXT: popl %ebp ; FALLBACK29-NEXT: vzeroupper ; FALLBACK29-NEXT: retl ; ; FALLBACK30-LABEL: shl_64bytes: ; FALLBACK30: # %bb.0: ; FALLBACK30-NEXT: pushl %ebp ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi ; FALLBACK30-NEXT: subl $204, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %zmm0 ; FALLBACK30-NEXT: movl (%eax), %eax ; FALLBACK30-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: leal 
(,%eax,8), %edx ; FALLBACK30-NEXT: andl $24, %edx ; FALLBACK30-NEXT: andl $60, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi ; FALLBACK30-NEXT: subl %eax, %edi ; FALLBACK30-NEXT: movl (%edi), %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 4(%edi), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl %edx, %ebx ; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi ; FALLBACK30-NEXT: shlxl %edx, %eax, %ecx ; FALLBACK30-NEXT: orl %ecx, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 8(%edi), %esi ; FALLBACK30-NEXT: movl %esi, %ecx ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK30-NEXT: movl 12(%edi), %ecx ; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shlxl %edx, %esi, %esi ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: shrl %eax ; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 16(%edi), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %eax ; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK30-NEXT: movl 20(%edi), %esi ; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK30-NEXT: orl %eax, %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 
24(%edi), %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK30-NEXT: movl 28(%edi), %ecx ; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: orl %eax, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 32(%edi), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %eax ; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK30-NEXT: movl 36(%edi), %esi ; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ecx ; FALLBACK30-NEXT: orl %eax, %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 40(%edi), %ecx ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %eax ; FALLBACK30-NEXT: movl 44(%edi), %ecx ; FALLBACK30-NEXT: shlxl %edx, %ecx, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: orl %eax, %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 48(%edi), %esi ; FALLBACK30-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrl %esi ; 
FALLBACK30-NEXT: shrxl %ebx, %esi, %eax ; FALLBACK30-NEXT: movl 52(%edi), %esi ; FALLBACK30-NEXT: shlxl %edx, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %ebp ; FALLBACK30-NEXT: orl %eax, %ebp ; FALLBACK30-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK30-NEXT: negl %eax ; FALLBACK30-NEXT: shlxl %edx, 188(%esp,%eax), %ecx ; FALLBACK30-NEXT: movl 56(%edi), %eax ; FALLBACK30-NEXT: shlxl %edx, %eax, %edx ; FALLBACK30-NEXT: shrl %esi ; FALLBACK30-NEXT: shrxl %ebx, %esi, %esi ; FALLBACK30-NEXT: orl %edx, %esi ; FALLBACK30-NEXT: shrl %eax ; FALLBACK30-NEXT: shrxl %ebx, %eax, %eax ; FALLBACK30-NEXT: orl %eax, %ecx ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK30-NEXT: movl %edx, (%eax) ; FALLBACK30-NEXT: movl %esi, 56(%eax) ; FALLBACK30-NEXT: movl %ecx, 60(%eax) ; FALLBACK30-NEXT: movl %ebp, 48(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 52(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 40(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 44(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 32(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 36(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 24(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx 
# 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 28(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 16(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 20(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 8(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 12(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 4(%eax) ; FALLBACK30-NEXT: addl $204, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx ; FALLBACK30-NEXT: popl %ebp ; FALLBACK30-NEXT: vzeroupper ; FALLBACK30-NEXT: retl ; ; FALLBACK31-LABEL: shl_64bytes: ; FALLBACK31: # %bb.0: ; FALLBACK31-NEXT: pushl %ebp ; FALLBACK31-NEXT: pushl %ebx ; FALLBACK31-NEXT: pushl %edi ; FALLBACK31-NEXT: pushl %esi ; FALLBACK31-NEXT: subl $204, %esp ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK31-NEXT: vmovups (%ecx), %zmm0 ; FALLBACK31-NEXT: movl (%eax), %ebx ; FALLBACK31-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: leal (,%ebx,8), %ecx ; FALLBACK31-NEXT: andl $24, %ecx ; FALLBACK31-NEXT: andl $60, %ebx ; FALLBACK31-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: subl %ebx, %eax ; FALLBACK31-NEXT: movl 4(%eax), %esi ; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 8(%eax), %edi ; FALLBACK31-NEXT: movl 12(%eax), %edx ; FALLBACK31-NEXT: movl %edx, %ebp ; FALLBACK31-NEXT: shldl %cl, %edi, %ebp ; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shldl %cl, %esi, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 
16(%eax), %edi ; FALLBACK31-NEXT: movl 20(%eax), %esi ; FALLBACK31-NEXT: movl %esi, %ebp ; FALLBACK31-NEXT: shldl %cl, %edi, %ebp ; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shldl %cl, %edx, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 24(%eax), %edi ; FALLBACK31-NEXT: movl 28(%eax), %edx ; FALLBACK31-NEXT: movl %edx, %ebp ; FALLBACK31-NEXT: shldl %cl, %edi, %ebp ; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shldl %cl, %esi, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 32(%eax), %edi ; FALLBACK31-NEXT: movl 36(%eax), %esi ; FALLBACK31-NEXT: movl %esi, %ebp ; FALLBACK31-NEXT: shldl %cl, %edi, %ebp ; FALLBACK31-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shldl %cl, %edx, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 40(%eax), %ebp ; FALLBACK31-NEXT: movl 44(%eax), %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shldl %cl, %ebp, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shldl %cl, %esi, %ebp ; FALLBACK31-NEXT: movl 56(%eax), %edx ; FALLBACK31-NEXT: movl 60(%eax), %edi ; FALLBACK31-NEXT: shldl %cl, %edx, %edi ; FALLBACK31-NEXT: movl (%eax), %esi ; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 52(%eax), %esi ; FALLBACK31-NEXT: shldl %cl, %esi, %edx ; FALLBACK31-NEXT: negl %ebx ; FALLBACK31-NEXT: movl 176(%esp,%ebx), %ebx ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: movl %edx, 56(%eax) ; FALLBACK31-NEXT: movl %edi, 60(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK31-NEXT: shlxl %ecx, %edx, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK31-NEXT: shldl %cl, %edx, %edi ; FALLBACK31-NEXT: shldl %cl, %ebx, %esi ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK31-NEXT: shldl %cl, %edx, %ebx ; FALLBACK31-NEXT: movl %ebx, 48(%eax) ; FALLBACK31-NEXT: movl %esi, 52(%eax) ; FALLBACK31-NEXT: movl %ebp, 40(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 44(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 32(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 36(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 24(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 28(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 16(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 20(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 8(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 12(%eax) ; FALLBACK31-NEXT: movl %edi, 4(%eax) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, (%eax) ; FALLBACK31-NEXT: addl $204, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi ; FALLBACK31-NEXT: popl %ebx ; FALLBACK31-NEXT: popl %ebp ; FALLBACK31-NEXT: vzeroupper ; FALLBACK31-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3 %res = shl i512 %src, %bitOff store i512 %res, ptr %dst, align 1 ret void } define void @shl_64bytes_qwordOff(ptr %src.ptr, 
ptr %qwordOff.ptr, ptr %dst) nounwind { ; X64-SSE2-LABEL: shl_64bytes_qwordOff: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rbx ; X64-SSE2-NEXT: movq (%rdi), %rax ; X64-SSE2-NEXT: movq 8(%rdi), %rcx ; X64-SSE2-NEXT: movq 16(%rdi), %r8 ; X64-SSE2-NEXT: movq 24(%rdi), %r9 ; X64-SSE2-NEXT: movq 32(%rdi), %r10 ; X64-SSE2-NEXT: movq 40(%rdi), %r11 ; X64-SSE2-NEXT: movq 48(%rdi), %rbx ; X64-SSE2-NEXT: movq 56(%rdi), %rdi ; X64-SSE2-NEXT: movl (%rsi), %esi ; X64-SSE2-NEXT: xorps %xmm0, %xmm0 ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: shll $3, %esi ; X64-SSE2-NEXT: andl $56, %esi ; X64-SSE2-NEXT: negl %esi ; X64-SSE2-NEXT: movslq %esi, %rax ; X64-SSE2-NEXT: movq -64(%rsp,%rax), %rcx ; X64-SSE2-NEXT: movq -56(%rsp,%rax), %rsi ; X64-SSE2-NEXT: movq -40(%rsp,%rax), %rdi ; X64-SSE2-NEXT: movq -48(%rsp,%rax), %r8 ; X64-SSE2-NEXT: movq -24(%rsp,%rax), %r9 ; X64-SSE2-NEXT: movq -32(%rsp,%rax), %r10 ; X64-SSE2-NEXT: movq -8(%rsp,%rax), %r11 ; X64-SSE2-NEXT: movq -16(%rsp,%rax), %rax ; X64-SSE2-NEXT: movq %rax, 48(%rdx) ; X64-SSE2-NEXT: movq %r11, 56(%rdx) ; X64-SSE2-NEXT: movq %r10, 32(%rdx) ; X64-SSE2-NEXT: movq %r9, 40(%rdx) ; X64-SSE2-NEXT: movq %r8, 16(%rdx) ; X64-SSE2-NEXT: movq %rdi, 24(%rdx) ; X64-SSE2-NEXT: movq %rcx, (%rdx) ; X64-SSE2-NEXT: movq %rsi, 8(%rdx) ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: shl_64bytes_qwordOff: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: pushq %rax ; X64-SSE42-NEXT: movups (%rdi), %xmm0 ; 
X64-SSE42-NEXT: movups 16(%rdi), %xmm1 ; X64-SSE42-NEXT: movups 32(%rdi), %xmm2 ; X64-SSE42-NEXT: movups 48(%rdi), %xmm3 ; X64-SSE42-NEXT: movl (%rsi), %eax ; X64-SSE42-NEXT: xorps %xmm4, %xmm4 ; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm4, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: shll $3, %eax ; X64-SSE42-NEXT: andl $56, %eax ; X64-SSE42-NEXT: negl %eax ; X64-SSE42-NEXT: cltq ; X64-SSE42-NEXT: movups -64(%rsp,%rax), %xmm0 ; X64-SSE42-NEXT: movups -48(%rsp,%rax), %xmm1 ; X64-SSE42-NEXT: movups -32(%rsp,%rax), %xmm2 ; X64-SSE42-NEXT: movups -16(%rsp,%rax), %xmm3 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx) ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: popq %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: shl_64bytes_qwordOff: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: pushq %rax ; X64-AVX1-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1 ; X64-AVX1-NEXT: movl (%rsi), %eax ; X64-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX1-NEXT: shll $3, %eax ; X64-AVX1-NEXT: andl $56, %eax ; X64-AVX1-NEXT: negl %eax ; X64-AVX1-NEXT: cltq ; X64-AVX1-NEXT: vmovups -64(%rsp,%rax), %xmm0 ; X64-AVX1-NEXT: vmovups -48(%rsp,%rax), %xmm1 ; X64-AVX1-NEXT: vmovups -32(%rsp,%rax), %xmm2 ; X64-AVX1-NEXT: vmovups -16(%rsp,%rax), %xmm3 ; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx) ; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx) ; 
X64-AVX1-NEXT: vmovups %xmm0, (%rdx) ; X64-AVX1-NEXT: popq %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; ; X64-AVX512-LABEL: shl_64bytes_qwordOff: ; X64-AVX512: # %bb.0: ; X64-AVX512-NEXT: pushq %rax ; X64-AVX512-NEXT: vmovups (%rdi), %zmm0 ; X64-AVX512-NEXT: movl (%rsi), %eax ; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; X64-AVX512-NEXT: shll $3, %eax ; X64-AVX512-NEXT: andl $56, %eax ; X64-AVX512-NEXT: negl %eax ; X64-AVX512-NEXT: cltq ; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0 ; X64-AVX512-NEXT: vmovups -48(%rsp,%rax), %xmm1 ; X64-AVX512-NEXT: vmovups -32(%rsp,%rax), %xmm2 ; X64-AVX512-NEXT: vmovups -16(%rsp,%rax), %xmm3 ; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx) ; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx) ; X64-AVX512-NEXT: vmovups %xmm0, (%rdx) ; X64-AVX512-NEXT: popq %rax ; X64-AVX512-NEXT: vzeroupper ; X64-AVX512-NEXT: retq ; ; X86-SSE2-LABEL: shl_64bytes_qwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $188, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movl (%ecx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 4(%ecx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 8(%ecx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 12(%ecx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 16(%ecx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 20(%ecx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 24(%ecx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
X86-SSE2-NEXT: movl 28(%ecx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 32(%ecx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 36(%ecx), %eax ; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 40(%ecx), %ebp ; X86-SSE2-NEXT: movl 44(%ecx), %ebx ; X86-SSE2-NEXT: movl 48(%ecx), %edi ; X86-SSE2-NEXT: movl 52(%ecx), %esi ; X86-SSE2-NEXT: movl 56(%ecx), %edx ; X86-SSE2-NEXT: movl 60(%ecx), %eax ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE2-NEXT: movl (%ecx), %ecx ; X86-SSE2-NEXT: xorps %xmm0, %xmm0 ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: 
movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SSE2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: shll $3, %ecx ; X86-SSE2-NEXT: andl $56, %ecx ; X86-SSE2-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: subl %ecx, %eax ; X86-SSE2-NEXT: movl (%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 4(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 12(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 8(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 20(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 16(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 28(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 24(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 36(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 32(%eax), %edx ; X86-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 44(%eax), %ebp ; X86-SSE2-NEXT: movl 40(%eax), %ebx ; X86-SSE2-NEXT: movl 52(%eax), %edi ; X86-SSE2-NEXT: movl 60(%eax), %esi ; X86-SSE2-NEXT: movl 56(%eax), %edx ; X86-SSE2-NEXT: negl %ecx ; X86-SSE2-NEXT: movl 160(%esp,%ecx), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %edx, 56(%eax) ; X86-SSE2-NEXT: movl %esi, 60(%eax) ; X86-SSE2-NEXT: movl %ecx, 48(%eax) ; X86-SSE2-NEXT: movl %edi, 52(%eax) ; X86-SSE2-NEXT: movl %ebx, 40(%eax) ; X86-SSE2-NEXT: movl %ebp, 44(%eax) ; X86-SSE2-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 28(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 16(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 20(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: addl $188, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: shl_64bytes_qwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: subl $140, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1 ; X86-SSE42-NEXT: movups 32(%edx), %xmm2 ; X86-SSE42-NEXT: movups 48(%edx), %xmm3 ; X86-SSE42-NEXT: movl (%ecx), %ecx ; X86-SSE42-NEXT: xorps %xmm4, %xmm4 ; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm4, (%esp) ; X86-SSE42-NEXT: movaps %xmm3, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm1, 
{{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: shll $3, %ecx ; X86-SSE42-NEXT: andl $56, %ecx ; X86-SSE42-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: subl %ecx, %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1 ; X86-SSE42-NEXT: movups 32(%edx), %xmm2 ; X86-SSE42-NEXT: negl %ecx ; X86-SSE42-NEXT: movups 112(%esp,%ecx), %xmm3 ; X86-SSE42-NEXT: movups %xmm3, 48(%eax) ; X86-SSE42-NEXT: movups %xmm2, 32(%eax) ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $140, %esp ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: shl_64bytes_qwordOff: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: subl $140, %esp ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: vmovups (%edx), %ymm0 ; X86-AVX1-NEXT: vmovups 32(%edx), %ymm1 ; X86-AVX1-NEXT: movl (%ecx), %ecx ; X86-AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; X86-AVX1-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: vmovups %ymm2, (%esp) ; X86-AVX1-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: shll $3, %ecx ; X86-AVX1-NEXT: andl $56, %ecx ; X86-AVX1-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-AVX1-NEXT: subl %ecx, %edx ; X86-AVX1-NEXT: vmovups (%edx), %xmm0 ; X86-AVX1-NEXT: vmovups 16(%edx), %xmm1 ; X86-AVX1-NEXT: vmovups 32(%edx), %xmm2 ; X86-AVX1-NEXT: negl %ecx ; X86-AVX1-NEXT: vmovups 112(%esp,%ecx), %xmm3 ; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax) ; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax) ; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX1-NEXT: vmovups %xmm0, (%eax) ; X86-AVX1-NEXT: addl $140, %esp ; X86-AVX1-NEXT: vzeroupper ; X86-AVX1-NEXT: retl ; ; X86-AVX512-LABEL: shl_64bytes_qwordOff: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: subl $140, %esp ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX512-NEXT: 
movl {{[0-9]+}}(%esp), %edx ; X86-AVX512-NEXT: vmovups (%edx), %zmm0 ; X86-AVX512-NEXT: movl (%ecx), %ecx ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX512-NEXT: vmovups %zmm1, (%esp) ; X86-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; X86-AVX512-NEXT: shll $3, %ecx ; X86-AVX512-NEXT: andl $56, %ecx ; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx ; X86-AVX512-NEXT: subl %ecx, %edx ; X86-AVX512-NEXT: vmovups (%edx), %xmm0 ; X86-AVX512-NEXT: vmovups 16(%edx), %xmm1 ; X86-AVX512-NEXT: vmovups 32(%edx), %xmm2 ; X86-AVX512-NEXT: negl %ecx ; X86-AVX512-NEXT: vmovups 112(%esp,%ecx), %xmm3 ; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax) ; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax) ; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX512-NEXT: vmovups %xmm0, (%eax) ; X86-AVX512-NEXT: addl $140, %esp ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %qwordOff = load i512, ptr %qwordOff.ptr, align 1 %bitOff = shl i512 %qwordOff, 6 %res = shl i512 %src, %bitOff store i512 %res, ptr %dst, align 1 ret void } define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-LABEL: ashr_64bytes: ; FALLBACK0: # %bb.0: ; FALLBACK0-NEXT: pushq %r15 ; FALLBACK0-NEXT: pushq %r14 ; FALLBACK0-NEXT: pushq %r13 ; FALLBACK0-NEXT: pushq %r12 ; FALLBACK0-NEXT: pushq %rbx ; FALLBACK0-NEXT: movq (%rdi), %rax ; FALLBACK0-NEXT: movq 8(%rdi), %rcx ; FALLBACK0-NEXT: movq 16(%rdi), %r8 ; FALLBACK0-NEXT: movq 24(%rdi), %r9 ; FALLBACK0-NEXT: movq 32(%rdi), %r10 ; FALLBACK0-NEXT: movq 40(%rdi), %r11 ; FALLBACK0-NEXT: movq 48(%rdi), %rbx ; FALLBACK0-NEXT: movq 56(%rdi), %r14 ; FALLBACK0-NEXT: movl (%rsi), %edi ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; 
FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: sarq $63, %r14 ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: leal (,%rdi,8), %eax ; FALLBACK0-NEXT: andl $56, %eax ; FALLBACK0-NEXT: andl $56, %edi ; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10 ; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8 ; FALLBACK0-NEXT: movq %r8, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %esi ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx ; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r9 ; FALLBACK0-NEXT: orq %r11, %r9 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r10 ; FALLBACK0-NEXT: addq %r8, %r8 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r8 ; FALLBACK0-NEXT: orq %r10, %r8 ; FALLBACK0-NEXT: movq -104(%rsp,%rdi), %r10 ; FALLBACK0-NEXT: movq %r10, %r15 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r15 ; FALLBACK0-NEXT: movq -96(%rsp,%rdi), %r14 ; FALLBACK0-NEXT: leaq (%r14,%r14), %r11 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r11 ; FALLBACK0-NEXT: orq %r15, %r11 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %rbx ; FALLBACK0-NEXT: addq %r10, %r10 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: orq %rbx, %r10 ; FALLBACK0-NEXT: movq -88(%rsp,%rdi), %rbx ; FALLBACK0-NEXT: movq %rbx, %r12 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r12 ; FALLBACK0-NEXT: movq -80(%rsp,%rdi), %r13 ; FALLBACK0-NEXT: leaq (%r13,%r13), %r15 ; FALLBACK0-NEXT: movl %esi, %ecx ; 
FALLBACK0-NEXT: shlq %cl, %r15 ; FALLBACK0-NEXT: orq %r12, %r15 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r14 ; FALLBACK0-NEXT: addq %rbx, %rbx ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %rbx ; FALLBACK0-NEXT: orq %r14, %rbx ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r13 ; FALLBACK0-NEXT: movq -72(%rsp,%rdi), %rdi ; FALLBACK0-NEXT: leaq (%rdi,%rdi), %r14 ; FALLBACK0-NEXT: movl %esi, %ecx ; FALLBACK0-NEXT: shlq %cl, %r14 ; FALLBACK0-NEXT: orq %r13, %r14 ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: sarq %cl, %rdi ; FALLBACK0-NEXT: movq %rdi, 56(%rdx) ; FALLBACK0-NEXT: movq %r14, 48(%rdx) ; FALLBACK0-NEXT: movq %rbx, 32(%rdx) ; FALLBACK0-NEXT: movq %r15, 40(%rdx) ; FALLBACK0-NEXT: movq %r10, 16(%rdx) ; FALLBACK0-NEXT: movq %r11, 24(%rdx) ; FALLBACK0-NEXT: movq %r8, (%rdx) ; FALLBACK0-NEXT: movq %r9, 8(%rdx) ; FALLBACK0-NEXT: popq %rbx ; FALLBACK0-NEXT: popq %r12 ; FALLBACK0-NEXT: popq %r13 ; FALLBACK0-NEXT: popq %r14 ; FALLBACK0-NEXT: popq %r15 ; FALLBACK0-NEXT: retq ; ; FALLBACK1-LABEL: ashr_64bytes: ; FALLBACK1: # %bb.0: ; FALLBACK1-NEXT: pushq %r15 ; FALLBACK1-NEXT: pushq %r14 ; FALLBACK1-NEXT: pushq %rbx ; FALLBACK1-NEXT: movq (%rdi), %rcx ; FALLBACK1-NEXT: movq 8(%rdi), %r8 ; FALLBACK1-NEXT: movq 16(%rdi), %r9 ; FALLBACK1-NEXT: movq 24(%rdi), %r10 ; FALLBACK1-NEXT: movq 32(%rdi), %r11 ; FALLBACK1-NEXT: movq 40(%rdi), %rbx ; FALLBACK1-NEXT: movq 48(%rdi), %r14 ; FALLBACK1-NEXT: movq 56(%rdi), %rdi ; FALLBACK1-NEXT: movl (%rsi), %eax ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: sarq $63, %rdi ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: 
movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: leal (,%rax,8), %ecx ; FALLBACK1-NEXT: andl $56, %ecx ; FALLBACK1-NEXT: andl $56, %eax ; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi ; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi ; FALLBACK1-NEXT: movq -120(%rsp,%rax), %r9 ; FALLBACK1-NEXT: movq %r9, %r8 ; FALLBACK1-NEXT: shrdq %cl, %rdi, %r8 ; FALLBACK1-NEXT: movq -96(%rsp,%rax), %r10 ; FALLBACK1-NEXT: movq -104(%rsp,%rax), %r11 ; FALLBACK1-NEXT: movq %r11, %rbx ; FALLBACK1-NEXT: shrdq %cl, %r10, %rbx ; FALLBACK1-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK1-NEXT: movq -80(%rsp,%rax), %r11 ; FALLBACK1-NEXT: movq -88(%rsp,%rax), %r14 ; FALLBACK1-NEXT: movq %r14, %r15 ; FALLBACK1-NEXT: shrdq %cl, %r11, %r15 ; FALLBACK1-NEXT: shrdq %cl, %r14, %r10 ; FALLBACK1-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK1-NEXT: shrdq %cl, %rax, %r11 ; FALLBACK1-NEXT: shrdq %cl, %r9, %rsi ; FALLBACK1-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK1-NEXT: sarq %cl, %rax ; FALLBACK1-NEXT: movq %r11, 48(%rdx) ; FALLBACK1-NEXT: movq %rax, 56(%rdx) ; FALLBACK1-NEXT: movq %r10, 32(%rdx) ; FALLBACK1-NEXT: movq %r15, 40(%rdx) ; FALLBACK1-NEXT: movq %rdi, 16(%rdx) ; FALLBACK1-NEXT: movq %rbx, 24(%rdx) ; FALLBACK1-NEXT: movq %rsi, (%rdx) ; FALLBACK1-NEXT: movq %r8, 8(%rdx) ; FALLBACK1-NEXT: popq %rbx ; FALLBACK1-NEXT: popq %r14 ; FALLBACK1-NEXT: popq %r15 ; FALLBACK1-NEXT: retq ; ; FALLBACK2-LABEL: ashr_64bytes: ; FALLBACK2: # %bb.0: ; FALLBACK2-NEXT: pushq %rbp ; FALLBACK2-NEXT: pushq %r15 ; FALLBACK2-NEXT: pushq %r14 ; FALLBACK2-NEXT: pushq %r13 ; FALLBACK2-NEXT: pushq %r12 ; FALLBACK2-NEXT: pushq %rbx ; FALLBACK2-NEXT: pushq %rax ; FALLBACK2-NEXT: movq (%rdi), %rcx ; FALLBACK2-NEXT: movq 8(%rdi), %r8 ; FALLBACK2-NEXT: 
movq 16(%rdi), %r9 ; FALLBACK2-NEXT: movq 24(%rdi), %r10 ; FALLBACK2-NEXT: movq 32(%rdi), %r11 ; FALLBACK2-NEXT: movq 40(%rdi), %rbx ; FALLBACK2-NEXT: movq 48(%rdi), %r14 ; FALLBACK2-NEXT: movq 56(%rdi), %rdi ; FALLBACK2-NEXT: movl (%rsi), %eax ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: sarq $63, %rdi ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx ; FALLBACK2-NEXT: andl $56, %ecx ; FALLBACK2-NEXT: andl $56, %eax ; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi ; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 ; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx ; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 ; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi ; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 ; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 ; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 ; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 ; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 ; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp ; FALLBACK2-NEXT: movl %ecx, %r12d ; FALLBACK2-NEXT: notb %r12b ; FALLBACK2-NEXT: addq %r9, %r9 ; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 ; FALLBACK2-NEXT: orq %rbx, %r9 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi ; FALLBACK2-NEXT: orq %r13, %rdi ; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx ; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 ; FALLBACK2-NEXT: movq 
-72(%rsp,%rax), %rax ; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx ; FALLBACK2-NEXT: addq %r10, %r10 ; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 ; FALLBACK2-NEXT: orq %r8, %r10 ; FALLBACK2-NEXT: addq %rsi, %rsi ; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi ; FALLBACK2-NEXT: orq %r11, %rsi ; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 ; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 ; FALLBACK2-NEXT: orq %r15, %r8 ; FALLBACK2-NEXT: addq %r14, %r14 ; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 ; FALLBACK2-NEXT: orq %rbp, %r11 ; FALLBACK2-NEXT: addq %rax, %rax ; FALLBACK2-NEXT: shlxq %r12, %rax, %rax ; FALLBACK2-NEXT: orq %r13, %rax ; FALLBACK2-NEXT: movq %rcx, 56(%rdx) ; FALLBACK2-NEXT: movq %rax, 48(%rdx) ; FALLBACK2-NEXT: movq %r11, 32(%rdx) ; FALLBACK2-NEXT: movq %r8, 40(%rdx) ; FALLBACK2-NEXT: movq %rsi, 16(%rdx) ; FALLBACK2-NEXT: movq %r10, 24(%rdx) ; FALLBACK2-NEXT: movq %rdi, (%rdx) ; FALLBACK2-NEXT: movq %r9, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 ; FALLBACK2-NEXT: popq %r13 ; FALLBACK2-NEXT: popq %r14 ; FALLBACK2-NEXT: popq %r15 ; FALLBACK2-NEXT: popq %rbp ; FALLBACK2-NEXT: retq ; ; FALLBACK3-LABEL: ashr_64bytes: ; FALLBACK3: # %bb.0: ; FALLBACK3-NEXT: pushq %r15 ; FALLBACK3-NEXT: pushq %r14 ; FALLBACK3-NEXT: pushq %rbx ; FALLBACK3-NEXT: movq (%rdi), %rcx ; FALLBACK3-NEXT: movq 8(%rdi), %r8 ; FALLBACK3-NEXT: movq 16(%rdi), %r9 ; FALLBACK3-NEXT: movq 24(%rdi), %r10 ; FALLBACK3-NEXT: movq 32(%rdi), %r11 ; FALLBACK3-NEXT: movq 40(%rdi), %rbx ; FALLBACK3-NEXT: movq 48(%rdi), %r14 ; FALLBACK3-NEXT: movq 56(%rdi), %rdi ; FALLBACK3-NEXT: movl (%rsi), %eax ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: 
sarq $63, %rdi ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: leal (,%rax,8), %ecx ; FALLBACK3-NEXT: andl $56, %ecx ; FALLBACK3-NEXT: andl $56, %eax ; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi ; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi ; FALLBACK3-NEXT: movq -120(%rsp,%rax), %r9 ; FALLBACK3-NEXT: movq %r9, %r8 ; FALLBACK3-NEXT: shrdq %cl, %rdi, %r8 ; FALLBACK3-NEXT: movq -96(%rsp,%rax), %r10 ; FALLBACK3-NEXT: movq -104(%rsp,%rax), %r11 ; FALLBACK3-NEXT: movq %r11, %rbx ; FALLBACK3-NEXT: shrdq %cl, %r10, %rbx ; FALLBACK3-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK3-NEXT: movq -80(%rsp,%rax), %r11 ; FALLBACK3-NEXT: movq -88(%rsp,%rax), %r14 ; FALLBACK3-NEXT: movq %r14, %r15 ; FALLBACK3-NEXT: shrdq %cl, %r11, %r15 ; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 ; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 ; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax ; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi ; FALLBACK3-NEXT: movq %r11, 48(%rdx) ; FALLBACK3-NEXT: movq %r10, 32(%rdx) ; FALLBACK3-NEXT: movq %r15, 40(%rdx) ; FALLBACK3-NEXT: movq %rdi, 16(%rdx) ; FALLBACK3-NEXT: movq %rbx, 24(%rdx) ; FALLBACK3-NEXT: movq %rsi, (%rdx) ; FALLBACK3-NEXT: movq %r8, 8(%rdx) ; FALLBACK3-NEXT: movq %rax, 56(%rdx) ; FALLBACK3-NEXT: popq %rbx ; FALLBACK3-NEXT: popq %r14 ; FALLBACK3-NEXT: popq %r15 ; FALLBACK3-NEXT: retq ; ; FALLBACK4-LABEL: ashr_64bytes: ; FALLBACK4: # %bb.0: ; FALLBACK4-NEXT: pushq %rbp ; FALLBACK4-NEXT: pushq %r15 ; FALLBACK4-NEXT: pushq %r14 ; FALLBACK4-NEXT: pushq %r13 ; FALLBACK4-NEXT: pushq %r12 ; FALLBACK4-NEXT: pushq %rbx ; FALLBACK4-NEXT: pushq %rax ; 
FALLBACK4-NEXT: movups (%rdi), %xmm0 ; FALLBACK4-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK4-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK4-NEXT: movq 48(%rdi), %rax ; FALLBACK4-NEXT: movq 56(%rdi), %rcx ; FALLBACK4-NEXT: movl (%rsi), %edi ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: sarq $63, %rcx ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: leal (,%rdi,8), %eax ; FALLBACK4-NEXT: andl $56, %eax ; FALLBACK4-NEXT: andl $56, %edi ; FALLBACK4-NEXT: movq -128(%rsp,%rdi), %r10 ; FALLBACK4-NEXT: movq -120(%rsp,%rdi), %r9 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 ; FALLBACK4-NEXT: movl %eax, %esi ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r9,%r9), %r8 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r8 ; FALLBACK4-NEXT: orq %r10, %r8 ; FALLBACK4-NEXT: movq -104(%rsp,%rdi), %r10 ; FALLBACK4-NEXT: movq %r10, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %rbx ; FALLBACK4-NEXT: movq -96(%rsp,%rdi), %r12 ; FALLBACK4-NEXT: leaq (%r12,%r12), %r11 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r11 ; FALLBACK4-NEXT: orq %rbx, %r11 ; FALLBACK4-NEXT: movq -112(%rsp,%rdi), %rbx ; FALLBACK4-NEXT: movq %rbx, %r14 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r14 ; FALLBACK4-NEXT: addq %r10, %r10 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: orq %r14, %r10 ; FALLBACK4-NEXT: movq -88(%rsp,%rdi), %r14 ; 
FALLBACK4-NEXT: movq %r14, %r13 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r13 ; FALLBACK4-NEXT: movq -80(%rsp,%rdi), %rbp ; FALLBACK4-NEXT: leaq (%rbp,%rbp), %r15 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r15 ; FALLBACK4-NEXT: orq %r13, %r15 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r12 ; FALLBACK4-NEXT: addq %r14, %r14 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r14 ; FALLBACK4-NEXT: orq %r12, %r14 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %rbp ; FALLBACK4-NEXT: movq -72(%rsp,%rdi), %rdi ; FALLBACK4-NEXT: leaq (%rdi,%rdi), %r12 ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %r12 ; FALLBACK4-NEXT: orq %rbp, %r12 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r9 ; FALLBACK4-NEXT: addq %rbx, %rbx ; FALLBACK4-NEXT: movl %esi, %ecx ; FALLBACK4-NEXT: shlq %cl, %rbx ; FALLBACK4-NEXT: orq %r9, %rbx ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: sarq %cl, %rdi ; FALLBACK4-NEXT: movq %rdi, 56(%rdx) ; FALLBACK4-NEXT: movq %rbx, 8(%rdx) ; FALLBACK4-NEXT: movq %r12, 48(%rdx) ; FALLBACK4-NEXT: movq %r14, 32(%rdx) ; FALLBACK4-NEXT: movq %r15, 40(%rdx) ; FALLBACK4-NEXT: movq %r10, 16(%rdx) ; FALLBACK4-NEXT: movq %r11, 24(%rdx) ; FALLBACK4-NEXT: movq %r8, (%rdx) ; FALLBACK4-NEXT: addq $8, %rsp ; FALLBACK4-NEXT: popq %rbx ; FALLBACK4-NEXT: popq %r12 ; FALLBACK4-NEXT: popq %r13 ; FALLBACK4-NEXT: popq %r14 ; FALLBACK4-NEXT: popq %r15 ; FALLBACK4-NEXT: popq %rbp ; FALLBACK4-NEXT: retq ; ; FALLBACK5-LABEL: ashr_64bytes: ; FALLBACK5: # %bb.0: ; FALLBACK5-NEXT: pushq %r15 ; FALLBACK5-NEXT: pushq %r14 ; FALLBACK5-NEXT: pushq %rbx ; FALLBACK5-NEXT: movups (%rdi), %xmm0 ; FALLBACK5-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK5-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK5-NEXT: movq 48(%rdi), %rcx ; FALLBACK5-NEXT: movq 56(%rdi), %rdi ; FALLBACK5-NEXT: movl (%rsi), %eax ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; 
FALLBACK5-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: sarq $63, %rdi ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: leal (,%rax,8), %ecx ; FALLBACK5-NEXT: andl $56, %ecx ; FALLBACK5-NEXT: andl $56, %eax ; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9 ; FALLBACK5-NEXT: movq %r9, %rsi ; FALLBACK5-NEXT: shrdq %cl, %rdi, %rsi ; FALLBACK5-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK5-NEXT: movq %r10, %r8 ; FALLBACK5-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK5-NEXT: movq -80(%rsp,%rax), %r9 ; FALLBACK5-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK5-NEXT: movq %r11, %rbx ; FALLBACK5-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK5-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK5-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK5-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK5-NEXT: movq -128(%rsp,%rax), %r14 ; FALLBACK5-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK5-NEXT: movq %rax, %r15 ; FALLBACK5-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK5-NEXT: shrdq %cl, %rax, %r14 ; FALLBACK5-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK5-NEXT: sarq %cl, %r11 ; FALLBACK5-NEXT: movq %r15, 8(%rdx) ; FALLBACK5-NEXT: movq %r9, 48(%rdx) ; FALLBACK5-NEXT: movq %r11, 56(%rdx) ; FALLBACK5-NEXT: movq %rdi, 32(%rdx) ; FALLBACK5-NEXT: movq %rbx, 40(%rdx) ; FALLBACK5-NEXT: movq %r8, 16(%rdx) ; FALLBACK5-NEXT: movq %rsi, 24(%rdx) ; FALLBACK5-NEXT: movq %r14, (%rdx) ; FALLBACK5-NEXT: popq %rbx ; FALLBACK5-NEXT: popq %r14 ; FALLBACK5-NEXT: popq %r15 ; FALLBACK5-NEXT: retq ; ; FALLBACK6-LABEL: ashr_64bytes: ; FALLBACK6: # %bb.0: ; FALLBACK6-NEXT: 
pushq %rbp ; FALLBACK6-NEXT: pushq %r15 ; FALLBACK6-NEXT: pushq %r14 ; FALLBACK6-NEXT: pushq %r13 ; FALLBACK6-NEXT: pushq %r12 ; FALLBACK6-NEXT: pushq %rbx ; FALLBACK6-NEXT: pushq %rax ; FALLBACK6-NEXT: movups (%rdi), %xmm0 ; FALLBACK6-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK6-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK6-NEXT: movq 48(%rdi), %rcx ; FALLBACK6-NEXT: movq 56(%rdi), %rdi ; FALLBACK6-NEXT: movl (%rsi), %eax ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: sarq $63, %rdi ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: leal (,%rax,8), %esi ; FALLBACK6-NEXT: andl $56, %esi ; FALLBACK6-NEXT: andl $56, %eax ; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 ; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx ; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi ; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 ; FALLBACK6-NEXT: movq -96(%rsp,%rax), %r13 ; FALLBACK6-NEXT: shrxq %rsi, %rcx, %r9 ; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 ; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 ; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 ; FALLBACK6-NEXT: movl %esi, %ebx ; FALLBACK6-NEXT: notb %bl ; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp ; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 ; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK6-NEXT: orq %r11, %r8 ; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 ; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK6-NEXT: orq %r12, %r11 ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 ; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 ; 
FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK6-NEXT: sarxq %rsi, %rax, %rsi ; FALLBACK6-NEXT: addq %rdi, %rdi ; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi ; FALLBACK6-NEXT: orq %r9, %rdi ; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 ; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 ; FALLBACK6-NEXT: orq %r14, %r9 ; FALLBACK6-NEXT: addq %r10, %r10 ; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 ; FALLBACK6-NEXT: orq %r15, %r10 ; FALLBACK6-NEXT: addq %rax, %rax ; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax ; FALLBACK6-NEXT: orq %r13, %rax ; FALLBACK6-NEXT: addq %rcx, %rcx ; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx ; FALLBACK6-NEXT: orq %rbp, %rcx ; FALLBACK6-NEXT: movq %rsi, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) ; FALLBACK6-NEXT: movq %rax, 48(%rdx) ; FALLBACK6-NEXT: movq %r10, 32(%rdx) ; FALLBACK6-NEXT: movq %r9, 40(%rdx) ; FALLBACK6-NEXT: movq %rdi, 16(%rdx) ; FALLBACK6-NEXT: movq %r11, 24(%rdx) ; FALLBACK6-NEXT: movq %r8, (%rdx) ; FALLBACK6-NEXT: addq $8, %rsp ; FALLBACK6-NEXT: popq %rbx ; FALLBACK6-NEXT: popq %r12 ; FALLBACK6-NEXT: popq %r13 ; FALLBACK6-NEXT: popq %r14 ; FALLBACK6-NEXT: popq %r15 ; FALLBACK6-NEXT: popq %rbp ; FALLBACK6-NEXT: retq ; ; FALLBACK7-LABEL: ashr_64bytes: ; FALLBACK7: # %bb.0: ; FALLBACK7-NEXT: pushq %r15 ; FALLBACK7-NEXT: pushq %r14 ; FALLBACK7-NEXT: pushq %rbx ; FALLBACK7-NEXT: movups (%rdi), %xmm0 ; FALLBACK7-NEXT: movups 16(%rdi), %xmm1 ; FALLBACK7-NEXT: movups 32(%rdi), %xmm2 ; FALLBACK7-NEXT: movq 48(%rdi), %rcx ; FALLBACK7-NEXT: movq 56(%rdi), %rdi ; FALLBACK7-NEXT: movl (%rsi), %eax ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: sarq $63, %rdi ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; 
FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: leal (,%rax,8), %ecx ; FALLBACK7-NEXT: andl $56, %ecx ; FALLBACK7-NEXT: andl $56, %eax ; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9 ; FALLBACK7-NEXT: movq %r9, %rsi ; FALLBACK7-NEXT: shrdq %cl, %rdi, %rsi ; FALLBACK7-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK7-NEXT: movq %r10, %r8 ; FALLBACK7-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK7-NEXT: movq -80(%rsp,%rax), %r9 ; FALLBACK7-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK7-NEXT: movq %r11, %rbx ; FALLBACK7-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK7-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK7-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK7-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK7-NEXT: movq -128(%rsp,%rax), %r14 ; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK7-NEXT: movq %rax, %r15 ; FALLBACK7-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK7-NEXT: sarxq %rcx, %r11, %r10 ; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK7-NEXT: shrdq %cl, %rax, %r14 ; FALLBACK7-NEXT: movq %r15, 8(%rdx) ; FALLBACK7-NEXT: movq %r9, 48(%rdx) ; FALLBACK7-NEXT: movq %rdi, 32(%rdx) ; FALLBACK7-NEXT: movq %rbx, 40(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) ; FALLBACK7-NEXT: movq %rsi, 24(%rdx) ; FALLBACK7-NEXT: movq %r14, (%rdx) ; FALLBACK7-NEXT: movq %r10, 56(%rdx) ; FALLBACK7-NEXT: popq %rbx ; FALLBACK7-NEXT: popq %r14 ; FALLBACK7-NEXT: popq %r15 ; FALLBACK7-NEXT: retq ; ; FALLBACK8-LABEL: ashr_64bytes: ; FALLBACK8: # %bb.0: ; FALLBACK8-NEXT: pushq %rbp ; FALLBACK8-NEXT: pushq %r15 ; FALLBACK8-NEXT: pushq %r14 ; FALLBACK8-NEXT: pushq %r13 ; FALLBACK8-NEXT: pushq %r12 ; FALLBACK8-NEXT: pushq %rbx ; FALLBACK8-NEXT: pushq %rax ; FALLBACK8-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK8-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK8-NEXT: movq 48(%rdi), %rax ; 
FALLBACK8-NEXT: movq 56(%rdi), %rcx ; FALLBACK8-NEXT: movl (%rsi), %edi ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: sarq $63, %rcx ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: leal (,%rdi,8), %eax ; FALLBACK8-NEXT: andl $56, %eax ; FALLBACK8-NEXT: andl $56, %edi ; FALLBACK8-NEXT: movq -128(%rsp,%rdi), %r10 ; FALLBACK8-NEXT: movq -120(%rsp,%rdi), %r9 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %esi ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r9,%r9), %r8 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r8 ; FALLBACK8-NEXT: orq %r10, %r8 ; FALLBACK8-NEXT: movq -104(%rsp,%rdi), %r10 ; FALLBACK8-NEXT: movq %r10, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %rbx ; FALLBACK8-NEXT: movq -96(%rsp,%rdi), %r12 ; FALLBACK8-NEXT: leaq (%r12,%r12), %r11 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r11 ; FALLBACK8-NEXT: orq %rbx, %r11 ; FALLBACK8-NEXT: movq -112(%rsp,%rdi), %rbx ; FALLBACK8-NEXT: movq %rbx, %r14 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r14 ; FALLBACK8-NEXT: addq %r10, %r10 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: orq %r14, %r10 ; FALLBACK8-NEXT: movq -88(%rsp,%rdi), %r14 ; FALLBACK8-NEXT: movq %r14, %r13 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r13 ; FALLBACK8-NEXT: movq -80(%rsp,%rdi), %rbp ; FALLBACK8-NEXT: leaq (%rbp,%rbp), %r15 ; FALLBACK8-NEXT: movl 
%esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r15 ; FALLBACK8-NEXT: orq %r13, %r15 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r12 ; FALLBACK8-NEXT: addq %r14, %r14 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r14 ; FALLBACK8-NEXT: orq %r12, %r14 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %rbp ; FALLBACK8-NEXT: movq -72(%rsp,%rdi), %rdi ; FALLBACK8-NEXT: leaq (%rdi,%rdi), %r12 ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %r12 ; FALLBACK8-NEXT: orq %rbp, %r12 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r9 ; FALLBACK8-NEXT: addq %rbx, %rbx ; FALLBACK8-NEXT: movl %esi, %ecx ; FALLBACK8-NEXT: shlq %cl, %rbx ; FALLBACK8-NEXT: orq %r9, %rbx ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: sarq %cl, %rdi ; FALLBACK8-NEXT: movq %rdi, 56(%rdx) ; FALLBACK8-NEXT: movq %rbx, 8(%rdx) ; FALLBACK8-NEXT: movq %r12, 48(%rdx) ; FALLBACK8-NEXT: movq %r14, 32(%rdx) ; FALLBACK8-NEXT: movq %r15, 40(%rdx) ; FALLBACK8-NEXT: movq %r10, 16(%rdx) ; FALLBACK8-NEXT: movq %r11, 24(%rdx) ; FALLBACK8-NEXT: movq %r8, (%rdx) ; FALLBACK8-NEXT: addq $8, %rsp ; FALLBACK8-NEXT: popq %rbx ; FALLBACK8-NEXT: popq %r12 ; FALLBACK8-NEXT: popq %r13 ; FALLBACK8-NEXT: popq %r14 ; FALLBACK8-NEXT: popq %r15 ; FALLBACK8-NEXT: popq %rbp ; FALLBACK8-NEXT: vzeroupper ; FALLBACK8-NEXT: retq ; ; FALLBACK9-LABEL: ashr_64bytes: ; FALLBACK9: # %bb.0: ; FALLBACK9-NEXT: pushq %r15 ; FALLBACK9-NEXT: pushq %r14 ; FALLBACK9-NEXT: pushq %rbx ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK9-NEXT: movq 48(%rdi), %rcx ; FALLBACK9-NEXT: movq 56(%rdi), %rdi ; FALLBACK9-NEXT: movl (%rsi), %eax ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: sarq $63, %rdi ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, 
-{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: leal (,%rax,8), %ecx ; FALLBACK9-NEXT: andl $56, %ecx ; FALLBACK9-NEXT: andl $56, %eax ; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 ; FALLBACK9-NEXT: movq %r9, %rsi ; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi ; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK9-NEXT: movq %r10, %r8 ; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 ; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK9-NEXT: movq %r11, %rbx ; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 ; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK9-NEXT: movq %rax, %r15 ; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: sarq %cl, %r11 ; FALLBACK9-NEXT: movq %r15, 8(%rdx) ; FALLBACK9-NEXT: movq %r9, 48(%rdx) ; FALLBACK9-NEXT: movq %r11, 56(%rdx) ; FALLBACK9-NEXT: movq %rdi, 32(%rdx) ; FALLBACK9-NEXT: movq %rbx, 40(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) ; FALLBACK9-NEXT: movq %rsi, 24(%rdx) ; FALLBACK9-NEXT: movq %r14, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 ; FALLBACK9-NEXT: popq %r15 ; FALLBACK9-NEXT: vzeroupper ; FALLBACK9-NEXT: retq ; ; FALLBACK10-LABEL: ashr_64bytes: ; FALLBACK10: # %bb.0: ; FALLBACK10-NEXT: pushq %rbp ; FALLBACK10-NEXT: pushq %r15 ; FALLBACK10-NEXT: pushq %r14 ; FALLBACK10-NEXT: pushq %r13 ; FALLBACK10-NEXT: pushq %r12 ; FALLBACK10-NEXT: pushq %rbx ; FALLBACK10-NEXT: pushq %rax ; FALLBACK10-NEXT: vmovups (%rdi), 
%ymm0 ; FALLBACK10-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK10-NEXT: movq 48(%rdi), %rcx ; FALLBACK10-NEXT: movq 56(%rdi), %rdi ; FALLBACK10-NEXT: movl (%rsi), %eax ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: sarq $63, %rdi ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: leal (,%rax,8), %esi ; FALLBACK10-NEXT: andl $56, %esi ; FALLBACK10-NEXT: andl $56, %eax ; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 ; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx ; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi ; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 ; FALLBACK10-NEXT: movq -96(%rsp,%rax), %r13 ; FALLBACK10-NEXT: shrxq %rsi, %rcx, %r9 ; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 ; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 ; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 ; FALLBACK10-NEXT: movl %esi, %ebx ; FALLBACK10-NEXT: notb %bl ; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp ; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 ; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK10-NEXT: orq %r11, %r8 ; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 ; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK10-NEXT: orq %r12, %r11 ; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 ; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 ; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK10-NEXT: sarxq %rsi, %rax, %rsi ; FALLBACK10-NEXT: addq %rdi, %rdi ; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi ; FALLBACK10-NEXT: orq %r9, %rdi ; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 
; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 ; FALLBACK10-NEXT: orq %r14, %r9 ; FALLBACK10-NEXT: addq %r10, %r10 ; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 ; FALLBACK10-NEXT: orq %r15, %r10 ; FALLBACK10-NEXT: addq %rax, %rax ; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax ; FALLBACK10-NEXT: orq %r13, %rax ; FALLBACK10-NEXT: addq %rcx, %rcx ; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx ; FALLBACK10-NEXT: orq %rbp, %rcx ; FALLBACK10-NEXT: movq %rsi, 56(%rdx) ; FALLBACK10-NEXT: movq %rcx, 8(%rdx) ; FALLBACK10-NEXT: movq %rax, 48(%rdx) ; FALLBACK10-NEXT: movq %r10, 32(%rdx) ; FALLBACK10-NEXT: movq %r9, 40(%rdx) ; FALLBACK10-NEXT: movq %rdi, 16(%rdx) ; FALLBACK10-NEXT: movq %r11, 24(%rdx) ; FALLBACK10-NEXT: movq %r8, (%rdx) ; FALLBACK10-NEXT: addq $8, %rsp ; FALLBACK10-NEXT: popq %rbx ; FALLBACK10-NEXT: popq %r12 ; FALLBACK10-NEXT: popq %r13 ; FALLBACK10-NEXT: popq %r14 ; FALLBACK10-NEXT: popq %r15 ; FALLBACK10-NEXT: popq %rbp ; FALLBACK10-NEXT: vzeroupper ; FALLBACK10-NEXT: retq ; ; FALLBACK11-LABEL: ashr_64bytes: ; FALLBACK11: # %bb.0: ; FALLBACK11-NEXT: pushq %r15 ; FALLBACK11-NEXT: pushq %r14 ; FALLBACK11-NEXT: pushq %rbx ; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK11-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK11-NEXT: movq 48(%rdi), %rcx ; FALLBACK11-NEXT: movq 56(%rdi), %rdi ; FALLBACK11-NEXT: movl (%rsi), %eax ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: sarq $63, %rdi ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: leal (,%rax,8), %ecx ; FALLBACK11-NEXT: 
andl $56, %ecx ; FALLBACK11-NEXT: andl $56, %eax ; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9 ; FALLBACK11-NEXT: movq %r9, %rsi ; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi ; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK11-NEXT: movq %r10, %r8 ; FALLBACK11-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9 ; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK11-NEXT: movq %r11, %rbx ; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK11-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14 ; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK11-NEXT: movq %rax, %r15 ; FALLBACK11-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK11-NEXT: sarxq %rcx, %r11, %r10 ; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK11-NEXT: shrdq %cl, %rax, %r14 ; FALLBACK11-NEXT: movq %r15, 8(%rdx) ; FALLBACK11-NEXT: movq %r9, 48(%rdx) ; FALLBACK11-NEXT: movq %rdi, 32(%rdx) ; FALLBACK11-NEXT: movq %rbx, 40(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) ; FALLBACK11-NEXT: movq %rsi, 24(%rdx) ; FALLBACK11-NEXT: movq %r14, (%rdx) ; FALLBACK11-NEXT: movq %r10, 56(%rdx) ; FALLBACK11-NEXT: popq %rbx ; FALLBACK11-NEXT: popq %r14 ; FALLBACK11-NEXT: popq %r15 ; FALLBACK11-NEXT: vzeroupper ; FALLBACK11-NEXT: retq ; ; FALLBACK12-LABEL: ashr_64bytes: ; FALLBACK12: # %bb.0: ; FALLBACK12-NEXT: pushq %rbp ; FALLBACK12-NEXT: pushq %r15 ; FALLBACK12-NEXT: pushq %r14 ; FALLBACK12-NEXT: pushq %r13 ; FALLBACK12-NEXT: pushq %r12 ; FALLBACK12-NEXT: pushq %rbx ; FALLBACK12-NEXT: pushq %rax ; FALLBACK12-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK12-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK12-NEXT: movq 48(%rdi), %rax ; FALLBACK12-NEXT: movq 56(%rdi), %rcx ; FALLBACK12-NEXT: movl (%rsi), %edi ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; 
FALLBACK12-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: sarq $63, %rcx ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: leal (,%rdi,8), %eax ; FALLBACK12-NEXT: andl $56, %eax ; FALLBACK12-NEXT: andl $56, %edi ; FALLBACK12-NEXT: movq -128(%rsp,%rdi), %r10 ; FALLBACK12-NEXT: movq -120(%rsp,%rdi), %r9 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %esi ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r9,%r9), %r8 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r8 ; FALLBACK12-NEXT: orq %r10, %r8 ; FALLBACK12-NEXT: movq -104(%rsp,%rdi), %r10 ; FALLBACK12-NEXT: movq %r10, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbx ; FALLBACK12-NEXT: movq -96(%rsp,%rdi), %r12 ; FALLBACK12-NEXT: leaq (%r12,%r12), %r11 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r11 ; FALLBACK12-NEXT: orq %rbx, %r11 ; FALLBACK12-NEXT: movq -112(%rsp,%rdi), %rbx ; FALLBACK12-NEXT: movq %rbx, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r14 ; FALLBACK12-NEXT: addq %r10, %r10 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: orq %r14, %r10 ; FALLBACK12-NEXT: movq -88(%rsp,%rdi), %r14 ; FALLBACK12-NEXT: movq %r14, %r13 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r13 ; FALLBACK12-NEXT: movq -80(%rsp,%rdi), %rbp ; FALLBACK12-NEXT: leaq (%rbp,%rbp), %r15 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r15 ; FALLBACK12-NEXT: orq %r13, %r15 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r12 ; FALLBACK12-NEXT: addq 
%r14, %r14 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r14 ; FALLBACK12-NEXT: orq %r12, %r14 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %rbp ; FALLBACK12-NEXT: movq -72(%rsp,%rdi), %rdi ; FALLBACK12-NEXT: leaq (%rdi,%rdi), %r12 ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %r12 ; FALLBACK12-NEXT: orq %rbp, %r12 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r9 ; FALLBACK12-NEXT: addq %rbx, %rbx ; FALLBACK12-NEXT: movl %esi, %ecx ; FALLBACK12-NEXT: shlq %cl, %rbx ; FALLBACK12-NEXT: orq %r9, %rbx ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: sarq %cl, %rdi ; FALLBACK12-NEXT: movq %rdi, 56(%rdx) ; FALLBACK12-NEXT: movq %rbx, 8(%rdx) ; FALLBACK12-NEXT: movq %r12, 48(%rdx) ; FALLBACK12-NEXT: movq %r14, 32(%rdx) ; FALLBACK12-NEXT: movq %r15, 40(%rdx) ; FALLBACK12-NEXT: movq %r10, 16(%rdx) ; FALLBACK12-NEXT: movq %r11, 24(%rdx) ; FALLBACK12-NEXT: movq %r8, (%rdx) ; FALLBACK12-NEXT: addq $8, %rsp ; FALLBACK12-NEXT: popq %rbx ; FALLBACK12-NEXT: popq %r12 ; FALLBACK12-NEXT: popq %r13 ; FALLBACK12-NEXT: popq %r14 ; FALLBACK12-NEXT: popq %r15 ; FALLBACK12-NEXT: popq %rbp ; FALLBACK12-NEXT: vzeroupper ; FALLBACK12-NEXT: retq ; ; FALLBACK13-LABEL: ashr_64bytes: ; FALLBACK13: # %bb.0: ; FALLBACK13-NEXT: pushq %r15 ; FALLBACK13-NEXT: pushq %r14 ; FALLBACK13-NEXT: pushq %rbx ; FALLBACK13-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK13-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK13-NEXT: movq 48(%rdi), %rcx ; FALLBACK13-NEXT: movq 56(%rdi), %rdi ; FALLBACK13-NEXT: movl (%rsi), %eax ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: sarq $63, %rdi ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; 
FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: leal (,%rax,8), %ecx ; FALLBACK13-NEXT: andl $56, %ecx ; FALLBACK13-NEXT: andl $56, %eax ; FALLBACK13-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK13-NEXT: movq -104(%rsp,%rax), %r9 ; FALLBACK13-NEXT: movq %r9, %rsi ; FALLBACK13-NEXT: shrdq %cl, %rdi, %rsi ; FALLBACK13-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK13-NEXT: movq %r10, %r8 ; FALLBACK13-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK13-NEXT: movq -80(%rsp,%rax), %r9 ; FALLBACK13-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK13-NEXT: movq %r11, %rbx ; FALLBACK13-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK13-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK13-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK13-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK13-NEXT: movq -128(%rsp,%rax), %r14 ; FALLBACK13-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK13-NEXT: movq %rax, %r15 ; FALLBACK13-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK13-NEXT: shrdq %cl, %rax, %r14 ; FALLBACK13-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK13-NEXT: sarq %cl, %r11 ; FALLBACK13-NEXT: movq %r15, 8(%rdx) ; FALLBACK13-NEXT: movq %r9, 48(%rdx) ; FALLBACK13-NEXT: movq %r11, 56(%rdx) ; FALLBACK13-NEXT: movq %rdi, 32(%rdx) ; FALLBACK13-NEXT: movq %rbx, 40(%rdx) ; FALLBACK13-NEXT: movq %r8, 16(%rdx) ; FALLBACK13-NEXT: movq %rsi, 24(%rdx) ; FALLBACK13-NEXT: movq %r14, (%rdx) ; FALLBACK13-NEXT: popq %rbx ; FALLBACK13-NEXT: popq %r14 ; FALLBACK13-NEXT: popq %r15 ; FALLBACK13-NEXT: vzeroupper ; FALLBACK13-NEXT: retq ; ; FALLBACK14-LABEL: ashr_64bytes: ; FALLBACK14: # %bb.0: ; FALLBACK14-NEXT: pushq %rbp ; FALLBACK14-NEXT: pushq %r15 ; FALLBACK14-NEXT: pushq %r14 ; FALLBACK14-NEXT: pushq %r13 ; FALLBACK14-NEXT: pushq %r12 ; FALLBACK14-NEXT: pushq %rbx ; FALLBACK14-NEXT: pushq %rax ; FALLBACK14-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK14-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK14-NEXT: movq 
48(%rdi), %rcx ; FALLBACK14-NEXT: movq 56(%rdi), %rdi ; FALLBACK14-NEXT: movl (%rsi), %eax ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: sarq $63, %rdi ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: leal (,%rax,8), %esi ; FALLBACK14-NEXT: andl $56, %esi ; FALLBACK14-NEXT: andl $56, %eax ; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 ; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rcx ; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi ; FALLBACK14-NEXT: shrxq %rsi, %rdi, %r12 ; FALLBACK14-NEXT: movq -96(%rsp,%rax), %r13 ; FALLBACK14-NEXT: shrxq %rsi, %rcx, %r9 ; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10 ; FALLBACK14-NEXT: shrxq %rsi, %r10, %r14 ; FALLBACK14-NEXT: shrxq %rsi, %r13, %r15 ; FALLBACK14-NEXT: movl %esi, %ebx ; FALLBACK14-NEXT: notb %bl ; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp ; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 ; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 ; FALLBACK14-NEXT: orq %r11, %r8 ; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 ; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 ; FALLBACK14-NEXT: orq %r12, %r11 ; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12 ; FALLBACK14-NEXT: shrxq %rsi, %r12, %r13 ; FALLBACK14-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK14-NEXT: sarxq %rsi, %rax, %rsi ; FALLBACK14-NEXT: addq %rdi, %rdi ; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi ; FALLBACK14-NEXT: orq %r9, %rdi ; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 ; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 ; FALLBACK14-NEXT: orq %r14, %r9 
; FALLBACK14-NEXT: addq %r10, %r10 ; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 ; FALLBACK14-NEXT: orq %r15, %r10 ; FALLBACK14-NEXT: addq %rax, %rax ; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax ; FALLBACK14-NEXT: orq %r13, %rax ; FALLBACK14-NEXT: addq %rcx, %rcx ; FALLBACK14-NEXT: shlxq %rbx, %rcx, %rcx ; FALLBACK14-NEXT: orq %rbp, %rcx ; FALLBACK14-NEXT: movq %rsi, 56(%rdx) ; FALLBACK14-NEXT: movq %rcx, 8(%rdx) ; FALLBACK14-NEXT: movq %rax, 48(%rdx) ; FALLBACK14-NEXT: movq %r10, 32(%rdx) ; FALLBACK14-NEXT: movq %r9, 40(%rdx) ; FALLBACK14-NEXT: movq %rdi, 16(%rdx) ; FALLBACK14-NEXT: movq %r11, 24(%rdx) ; FALLBACK14-NEXT: movq %r8, (%rdx) ; FALLBACK14-NEXT: addq $8, %rsp ; FALLBACK14-NEXT: popq %rbx ; FALLBACK14-NEXT: popq %r12 ; FALLBACK14-NEXT: popq %r13 ; FALLBACK14-NEXT: popq %r14 ; FALLBACK14-NEXT: popq %r15 ; FALLBACK14-NEXT: popq %rbp ; FALLBACK14-NEXT: vzeroupper ; FALLBACK14-NEXT: retq ; ; FALLBACK15-LABEL: ashr_64bytes: ; FALLBACK15: # %bb.0: ; FALLBACK15-NEXT: pushq %r15 ; FALLBACK15-NEXT: pushq %r14 ; FALLBACK15-NEXT: pushq %rbx ; FALLBACK15-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK15-NEXT: vmovups 32(%rdi), %xmm1 ; FALLBACK15-NEXT: movq 48(%rdi), %rcx ; FALLBACK15-NEXT: movq 56(%rdi), %rdi ; FALLBACK15-NEXT: movl (%rsi), %eax ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: sarq $63, %rdi ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: leal (,%rax,8), %ecx ; FALLBACK15-NEXT: andl $56, %ecx ; FALLBACK15-NEXT: andl $56, %eax ; FALLBACK15-NEXT: movq 
-96(%rsp,%rax), %rdi ; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 ; FALLBACK15-NEXT: movq %r9, %rsi ; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi ; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 ; FALLBACK15-NEXT: movq %r10, %r8 ; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 ; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 ; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 ; FALLBACK15-NEXT: movq %r11, %rbx ; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx ; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi ; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 ; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 ; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 ; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK15-NEXT: movq %rax, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 ; FALLBACK15-NEXT: sarxq %rcx, %r11, %r10 ; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) ; FALLBACK15-NEXT: movq %rdi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) ; FALLBACK15-NEXT: movq %rsi, 24(%rdx) ; FALLBACK15-NEXT: movq %r14, (%rdx) ; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx ; FALLBACK15-NEXT: popq %r14 ; FALLBACK15-NEXT: popq %r15 ; FALLBACK15-NEXT: vzeroupper ; FALLBACK15-NEXT: retq ; ; FALLBACK16-LABEL: ashr_64bytes: ; FALLBACK16: # %bb.0: ; FALLBACK16-NEXT: pushl %ebp ; FALLBACK16-NEXT: pushl %ebx ; FALLBACK16-NEXT: pushl %edi ; FALLBACK16-NEXT: pushl %esi ; FALLBACK16-NEXT: subl $204, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK16-NEXT: movl (%ecx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 4(%ecx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 8(%ecx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 12(%ecx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
; FALLBACK16-NEXT: movl 16(%ecx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 20(%ecx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 24(%ecx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 28(%ecx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 32(%ecx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 36(%ecx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 40(%ecx), %ebx ; FALLBACK16-NEXT: movl 44(%ecx), %edi ; FALLBACK16-NEXT: movl 48(%ecx), %esi ; FALLBACK16-NEXT: movl 52(%ecx), %edx ; FALLBACK16-NEXT: movl 56(%ecx), %eax ; FALLBACK16-NEXT: movl 60(%ecx), %ecx ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK16-NEXT: movl (%ebp), %ebp ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: sarl $31, %ecx ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebp, %ecx ; FALLBACK16-NEXT: movl %ebp, %esi ; FALLBACK16-NEXT: andl $60, %esi ; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx ; FALLBACK16-NEXT: shll $3, %ecx ; FALLBACK16-NEXT: andl $24, %ecx ; FALLBACK16-NEXT: movl %edx, %eax ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl 72(%esp,%esi), %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: addl %edi, %edi ; FALLBACK16-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK16-NEXT: movl %ecx, %ebx ; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: notb %ch ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK16-NEXT: shll %cl, 
%edi ; FALLBACK16-NEXT: orl %eax, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 64(%esp,%esi), %eax ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: addl %edx, %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: orl %eax, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 76(%esp,%esi), %ebp ; FALLBACK16-NEXT: movl %ebp, %edx ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi ; FALLBACK16-NEXT: leal (%edi,%edi), %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: orl %edx, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %eax, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl %esi, %edx ; FALLBACK16-NEXT: movl 84(%esp,%esi), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl 88(%esp,%esi), %esi ; FALLBACK16-NEXT: leal (%esi,%esi), %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %eax, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK16-NEXT: addl %ebx, %ebx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: orl %edi, %ebx ; 
FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl %edx, %eax ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 92(%esp,%edx), %ebp ; FALLBACK16-NEXT: movl %ebp, %edx ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: movl 96(%esp,%eax), %edi ; FALLBACK16-NEXT: leal (%edi,%edi), %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: orl %edx, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %esi, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: movl 100(%esp,%edx), %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl 104(%esp,%edx), %esi ; FALLBACK16-NEXT: leal (%esi,%esi), %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %eax, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl %ebx, %edx ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK16-NEXT: addl %ebx, %ebx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: orl %edi, %ebx ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK16-NEXT: movl 108(%esp,%ebp), %edi ; FALLBACK16-NEXT: movl %edi, %eax ; FALLBACK16-NEXT: movl %edx, %ebx ; 
FALLBACK16-NEXT: movl %ebx, %ecx ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl 112(%esp,%ebp), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl %ebp, %edx ; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %eax, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: addl %edi, %edi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: orl %esi, %edi ; FALLBACK16-NEXT: movl 116(%esp,%edx), %esi ; FALLBACK16-NEXT: movl %esi, %eax ; FALLBACK16-NEXT: movl %ebx, %ecx ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl 120(%esp,%edx), %edx ; FALLBACK16-NEXT: leal (%edx,%edx), %ebp ; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %eax, %ebp ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: addl %esi, %esi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %esi ; FALLBACK16-NEXT: orl %eax, %esi ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: movl %edx, %eax ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx ; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edx ; FALLBACK16-NEXT: orl %eax, %edx ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK16-NEXT: sarl %cl, %ebx ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl %ebx, 60(%eax) ; 
FALLBACK16-NEXT: movl %edx, 56(%eax) ; FALLBACK16-NEXT: movl %esi, 48(%eax) ; FALLBACK16-NEXT: movl %ebp, 52(%eax) ; FALLBACK16-NEXT: movl %edi, 40(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 44(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 32(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 36(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 24(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 28(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 16(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 20(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 8(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 12(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, (%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 4(%eax) ; FALLBACK16-NEXT: addl $204, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi ; FALLBACK16-NEXT: popl %ebx ; FALLBACK16-NEXT: popl %ebp ; FALLBACK16-NEXT: retl ; ; FALLBACK17-LABEL: ashr_64bytes: ; FALLBACK17: # %bb.0: ; FALLBACK17-NEXT: pushl %ebp ; FALLBACK17-NEXT: pushl %ebx ; FALLBACK17-NEXT: pushl %edi ; FALLBACK17-NEXT: pushl %esi ; FALLBACK17-NEXT: subl $188, %esp ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK17-NEXT: movl (%eax), %ecx ; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 4(%eax), %ecx ; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill ; FALLBACK17-NEXT: movl 8(%eax), %ecx ; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 12(%eax), %ecx ; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 16(%eax), %ecx ; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 20(%eax), %ecx ; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 24(%eax), %ecx ; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 28(%eax), %ecx ; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 32(%eax), %ecx ; FALLBACK17-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 36(%eax), %ecx ; FALLBACK17-NEXT: movl %ecx, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: movl 40(%eax), %ebp ; FALLBACK17-NEXT: movl 44(%eax), %ebx ; FALLBACK17-NEXT: movl 48(%eax), %edi ; FALLBACK17-NEXT: movl 52(%eax), %esi ; FALLBACK17-NEXT: movl 56(%eax), %edx ; FALLBACK17-NEXT: movl 60(%eax), %eax ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK17-NEXT: movl (%ecx), %ecx ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl (%esp), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; 
FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: sarl $31, %eax ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %ecx, %ebp ; FALLBACK17-NEXT: andl $60, %ebp ; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shll $3, %ecx ; FALLBACK17-NEXT: andl $24, %ecx ; FALLBACK17-NEXT: shrdl %cl, %edx, %eax ; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %esi ; FALLBACK17-NEXT: shrdl %cl, %edi, %esi ; 
FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %edx ; FALLBACK17-NEXT: shrdl %cl, %esi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %edx ; FALLBACK17-NEXT: shrdl %cl, %edi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi ; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %edx ; FALLBACK17-NEXT: shrdl %cl, %esi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl %esi, %edx ; FALLBACK17-NEXT: shrdl %cl, %eax, %edi ; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill ; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %edi ; FALLBACK17-NEXT: shrdl %cl, %esi, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx ; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax ; FALLBACK17-NEXT: movl %eax, %edi ; FALLBACK17-NEXT: shrdl %cl, %edx, %edi ; FALLBACK17-NEXT: shrdl %cl, %eax, %esi ; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx ; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax ; FALLBACK17-NEXT: shrdl %cl, %eax, %edx ; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK17-NEXT: movl %edx, 56(%ebp) 
; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx ; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK17-NEXT: sarl %cl, %eax ; FALLBACK17-NEXT: movl %eax, 60(%ebp) ; FALLBACK17-NEXT: movl %esi, 48(%ebp) ; FALLBACK17-NEXT: movl %edi, 52(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 40(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 44(%ebp) ; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 32(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 36(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 24(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 28(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 16(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 20(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 8(%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 12(%ebp) ; FALLBACK17-NEXT: movl %ebx, (%ebp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, 4(%ebp) ; FALLBACK17-NEXT: addl $188, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi ; FALLBACK17-NEXT: popl %ebx ; FALLBACK17-NEXT: popl %ebp ; FALLBACK17-NEXT: retl ; ; FALLBACK18-LABEL: ashr_64bytes: ; FALLBACK18: # %bb.0: ; FALLBACK18-NEXT: pushl %ebp ; FALLBACK18-NEXT: pushl %ebx ; FALLBACK18-NEXT: pushl %edi ; FALLBACK18-NEXT: pushl %esi ; FALLBACK18-NEXT: subl $204, %esp ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
FALLBACK18-NEXT: movl (%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 4(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 8(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 12(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 16(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 20(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 24(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 28(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 32(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 36(%eax), %ecx ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 40(%eax), %ebp ; FALLBACK18-NEXT: movl 44(%eax), %ebx ; FALLBACK18-NEXT: movl 48(%eax), %edi ; FALLBACK18-NEXT: movl 52(%eax), %esi ; FALLBACK18-NEXT: movl 56(%eax), %edx ; FALLBACK18-NEXT: movl 60(%eax), %ecx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl (%eax), %eax ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; 
FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK18-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: sarl $31, %ecx ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: leal (,%eax,8), %edx ; FALLBACK18-NEXT: andl $24, %edx ; FALLBACK18-NEXT: andl $60, %ecx ; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, %esi, %eax 
; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl %edx, %ebx ; FALLBACK18-NEXT: notb %bl ; FALLBACK18-NEXT: leal (%edi,%edi), %ebp ; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax ; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, 64(%esp,%ecx), %edi ; FALLBACK18-NEXT: addl %esi, %esi ; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK18-NEXT: orl %edi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: movl 76(%esp,%ecx), %edi ; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: movl 84(%esp,%ecx), %edi ; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK18-NEXT: leal (%esi,%esi), %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: movl 92(%esp,%ecx), %edi ; FALLBACK18-NEXT: shrxl %edx, %edi, 
%ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %edi, %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK18-NEXT: orl %eax, %edi ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: movl 100(%esp,%ecx), %edi ; FALLBACK18-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, %esi, %esi ; FALLBACK18-NEXT: addl %edi, %edi ; FALLBACK18-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK18-NEXT: orl %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal (%eax,%eax), %esi ; FALLBACK18-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK18-NEXT: movl 108(%esp,%ecx), %esi ; FALLBACK18-NEXT: movl %ecx, %edi ; FALLBACK18-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, %esi, %ebp ; FALLBACK18-NEXT: orl %ebp, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK18-NEXT: addl %esi, %esi ; FALLBACK18-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK18-NEXT: orl %ecx, %esi ; FALLBACK18-NEXT: movl 120(%esp,%edi), %ebp ; FALLBACK18-NEXT: leal (%ebp,%ebp), %ecx ; FALLBACK18-NEXT: shlxl %ebx, %ecx, %ecx ; FALLBACK18-NEXT: movl 116(%esp,%edi), %eax ; FALLBACK18-NEXT: shrxl %edx, %eax, %edi ; FALLBACK18-NEXT: orl %edi, %ecx ; FALLBACK18-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte 
Folded Reload ; FALLBACK18-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: addl %eax, %eax ; FALLBACK18-NEXT: shlxl %ebx, %eax, %edi ; FALLBACK18-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK18-NEXT: shrxl %edx, %ebp, %eax ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK18-NEXT: movl 124(%esp,%ebp), %ebp ; FALLBACK18-NEXT: sarxl %edx, %ebp, %edx ; FALLBACK18-NEXT: addl %ebp, %ebp ; FALLBACK18-NEXT: shlxl %ebx, %ebp, %ebx ; FALLBACK18-NEXT: orl %eax, %ebx ; FALLBACK18-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK18-NEXT: movl %edx, 60(%eax) ; FALLBACK18-NEXT: movl %ebx, 56(%eax) ; FALLBACK18-NEXT: movl %edi, 48(%eax) ; FALLBACK18-NEXT: movl %ecx, 52(%eax) ; FALLBACK18-NEXT: movl %esi, 40(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 44(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 32(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 36(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 24(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 28(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 16(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 20(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 8(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 12(%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, (%eax) ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK18-NEXT: movl %ecx, 4(%eax) ; 
FALLBACK18-NEXT: addl $204, %esp ; FALLBACK18-NEXT: popl %esi ; FALLBACK18-NEXT: popl %edi ; FALLBACK18-NEXT: popl %ebx ; FALLBACK18-NEXT: popl %ebp ; FALLBACK18-NEXT: retl ; ; FALLBACK19-LABEL: ashr_64bytes: ; FALLBACK19: # %bb.0: ; FALLBACK19-NEXT: pushl %ebp ; FALLBACK19-NEXT: pushl %ebx ; FALLBACK19-NEXT: pushl %edi ; FALLBACK19-NEXT: pushl %esi ; FALLBACK19-NEXT: subl $188, %esp ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: movl (%eax), %ecx ; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 4(%eax), %ecx ; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 8(%eax), %ecx ; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 12(%eax), %ecx ; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 16(%eax), %ecx ; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 20(%eax), %ecx ; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 24(%eax), %ecx ; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 28(%eax), %ecx ; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 32(%eax), %ecx ; FALLBACK19-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 36(%eax), %ecx ; FALLBACK19-NEXT: movl %ecx, (%esp) # 4-byte Spill ; FALLBACK19-NEXT: movl 40(%eax), %ebp ; FALLBACK19-NEXT: movl 44(%eax), %ebx ; FALLBACK19-NEXT: movl 48(%eax), %edi ; FALLBACK19-NEXT: movl 52(%eax), %esi ; FALLBACK19-NEXT: movl 56(%eax), %edx ; FALLBACK19-NEXT: movl 60(%eax), %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK19-NEXT: movl (%ecx), %ecx ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebx, 
{{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl (%esp), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: sarl $31, %eax ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, 
{{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: movl %ecx, %ebp ; FALLBACK19-NEXT: andl $60, %ebp ; FALLBACK19-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shll $3, %ecx ; FALLBACK19-NEXT: andl $24, %ecx ; FALLBACK19-NEXT: shrdl %cl, %edx, %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK19-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %esi ; FALLBACK19-NEXT: shrdl %cl, %edi, %esi ; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK19-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %esi, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %edi ; FALLBACK19-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK19-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %edi, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %esi ; FALLBACK19-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 88(%esp,%ebp), %ebx ; FALLBACK19-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %edi ; FALLBACK19-NEXT: movl %edi, (%esp) # 4-byte Spill ; FALLBACK19-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK19-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %esi, %edx ; 
FALLBACK19-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK19-NEXT: movl 104(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl 100(%esp,%ebp), %edi ; FALLBACK19-NEXT: movl %edi, %edx ; FALLBACK19-NEXT: shrdl %cl, %eax, %edx ; FALLBACK19-NEXT: shrdl %cl, %edi, %esi ; FALLBACK19-NEXT: movl 48(%esp,%ebp), %edi ; FALLBACK19-NEXT: movl 108(%esp,%ebp), %ebp ; FALLBACK19-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK19-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK19-NEXT: movl %eax, 56(%ebp) ; FALLBACK19-NEXT: movl %esi, 48(%ebp) ; FALLBACK19-NEXT: movl %edx, 52(%ebp) ; FALLBACK19-NEXT: movl %ebx, 40(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 44(%ebp) ; FALLBACK19-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 32(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 36(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 24(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 28(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 16(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 20(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 8(%ebp) ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, 12(%ebp) ; FALLBACK19-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK19-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK19-NEXT: shrdl %cl, %edx, %edi ; FALLBACK19-NEXT: movl %edi, (%ebp) ; 
FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK19-NEXT: movl %ecx, 4(%ebp) ; FALLBACK19-NEXT: movl %eax, 60(%ebp) ; FALLBACK19-NEXT: addl $188, %esp ; FALLBACK19-NEXT: popl %esi ; FALLBACK19-NEXT: popl %edi ; FALLBACK19-NEXT: popl %ebx ; FALLBACK19-NEXT: popl %ebp ; FALLBACK19-NEXT: retl ; ; FALLBACK20-LABEL: ashr_64bytes: ; FALLBACK20: # %bb.0: ; FALLBACK20-NEXT: pushl %ebp ; FALLBACK20-NEXT: pushl %ebx ; FALLBACK20-NEXT: pushl %edi ; FALLBACK20-NEXT: pushl %esi ; FALLBACK20-NEXT: subl $204, %esp ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK20-NEXT: movups (%ecx), %xmm0 ; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK20-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK20-NEXT: movl 48(%ecx), %edx ; FALLBACK20-NEXT: movl 52(%ecx), %esi ; FALLBACK20-NEXT: movl 56(%ecx), %edi ; FALLBACK20-NEXT: movl 60(%ecx), %ecx ; FALLBACK20-NEXT: movl (%eax), %eax ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: sarl $31, %ecx ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl 
%ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %eax, %esi ; FALLBACK20-NEXT: andl $60, %esi ; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx ; FALLBACK20-NEXT: shll $3, %eax ; FALLBACK20-NEXT: andl $24, %eax ; FALLBACK20-NEXT: movl %edx, %edi ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK20-NEXT: movb %al, %ch ; FALLBACK20-NEXT: notb %ch ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %edi, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: addl %edx, %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: orl %edi, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx ; FALLBACK20-NEXT: movl %edx, %ebp ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi ; FALLBACK20-NEXT: leal (%edi,%edi), %ebx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %ebp, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: addl %edx, %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: orl %ebx, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx ; FALLBACK20-NEXT: movl %ebx, %ebp ; FALLBACK20-NEXT: movl %eax, %edx ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp ; 
FALLBACK20-NEXT: movl 88(%esp,%esi), %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: addl %eax, %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax ; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: addl %ebx, %ebx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %edi, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx ; FALLBACK20-NEXT: movl %ebx, %ebp ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi ; FALLBACK20-NEXT: leal (%edi,%edi), %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax ; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: addl %ebx, %ebx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %eax, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx ; FALLBACK20-NEXT: movl %ebx, %ebp ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx ; FALLBACK20-NEXT: leal (%edx,%edx), %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax ; FALLBACK20-NEXT: orl %ebp, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK20-NEXT: movb %al, %cl ; 
FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: addl %ebx, %ebx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %edi, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi ; FALLBACK20-NEXT: movl %edi, %ebp ; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %ebp, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shrl %cl, %edx ; FALLBACK20-NEXT: addl %edi, %edi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edi ; FALLBACK20-NEXT: orl %edx, %edi ; FALLBACK20-NEXT: movl %esi, %edx ; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi ; FALLBACK20-NEXT: movl %esi, %ebx ; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax ; FALLBACK20-NEXT: leal (%eax,%eax), %ebp ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebp ; FALLBACK20-NEXT: orl %ebx, %ebp ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: orl %ebx, %esi ; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %eax ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx ; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx ; 
FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edx ; FALLBACK20-NEXT: orl %eax, %edx ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK20-NEXT: sarl %cl, %ebx ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl %ebx, 60(%eax) ; FALLBACK20-NEXT: movl %edx, 56(%eax) ; FALLBACK20-NEXT: movl %esi, 48(%eax) ; FALLBACK20-NEXT: movl %ebp, 52(%eax) ; FALLBACK20-NEXT: movl %edi, 40(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 44(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 32(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 36(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 24(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 28(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 16(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 20(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 8(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 12(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, (%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 4(%eax) ; FALLBACK20-NEXT: addl $204, %esp ; FALLBACK20-NEXT: popl %esi ; FALLBACK20-NEXT: popl %edi ; FALLBACK20-NEXT: popl %ebx ; FALLBACK20-NEXT: popl %ebp ; FALLBACK20-NEXT: retl ; ; FALLBACK21-LABEL: ashr_64bytes: ; FALLBACK21: # %bb.0: ; FALLBACK21-NEXT: pushl %ebp ; FALLBACK21-NEXT: pushl 
%ebx ; FALLBACK21-NEXT: pushl %edi ; FALLBACK21-NEXT: pushl %esi ; FALLBACK21-NEXT: subl $188, %esp ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK21-NEXT: movups (%eax), %xmm0 ; FALLBACK21-NEXT: movups 16(%eax), %xmm1 ; FALLBACK21-NEXT: movups 32(%eax), %xmm2 ; FALLBACK21-NEXT: movl 48(%eax), %edx ; FALLBACK21-NEXT: movl 52(%eax), %esi ; FALLBACK21-NEXT: movl 56(%eax), %edi ; FALLBACK21-NEXT: movl 60(%eax), %eax ; FALLBACK21-NEXT: movl (%ecx), %ecx ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: sarl $31, %eax ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %ecx, %ebp ; FALLBACK21-NEXT: andl $60, %ebp ; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shll $3, %ecx ; FALLBACK21-NEXT: andl $24, %ecx ; FALLBACK21-NEXT: shrdl %cl, %edx, %eax ; FALLBACK21-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %esi ; FALLBACK21-NEXT: shrdl %cl, %edi, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %edi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi ; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl %esi, %edx ; FALLBACK21-NEXT: shrdl %cl, %eax, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edi ; FALLBACK21-NEXT: shrdl %cl, %esi, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx ; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax ; FALLBACK21-NEXT: movl %eax, %edi ; FALLBACK21-NEXT: shrdl %cl, %edx, %edi ; FALLBACK21-NEXT: shrdl %cl, %eax, %esi ; 
FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx ; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax ; FALLBACK21-NEXT: shrdl %cl, %eax, %edx ; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK21-NEXT: movl %edx, 56(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx ; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK21-NEXT: sarl %cl, %eax ; FALLBACK21-NEXT: movl %eax, 60(%ebp) ; FALLBACK21-NEXT: movl %esi, 48(%ebp) ; FALLBACK21-NEXT: movl %edi, 52(%ebp) ; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 40(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 44(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 32(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 36(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 24(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 28(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 16(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 20(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 8(%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 12(%ebp) ; FALLBACK21-NEXT: movl %ebx, (%ebp) ; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK21-NEXT: movl %eax, 4(%ebp) ; FALLBACK21-NEXT: addl $188, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi ; FALLBACK21-NEXT: popl %ebx ; FALLBACK21-NEXT: popl %ebp ; FALLBACK21-NEXT: retl ; ; FALLBACK22-LABEL: ashr_64bytes: ; FALLBACK22: # %bb.0: ; 
FALLBACK22-NEXT: pushl %ebp ; FALLBACK22-NEXT: pushl %ebx ; FALLBACK22-NEXT: pushl %edi ; FALLBACK22-NEXT: pushl %esi ; FALLBACK22-NEXT: subl $204, %esp ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK22-NEXT: movups (%ecx), %xmm0 ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK22-NEXT: movl 48(%ecx), %edx ; FALLBACK22-NEXT: movl 52(%ecx), %esi ; FALLBACK22-NEXT: movl 56(%ecx), %edi ; FALLBACK22-NEXT: movl 60(%ecx), %ecx ; FALLBACK22-NEXT: movl (%eax), %eax ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: sarl $31, %ecx ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %eax, %ecx ; FALLBACK22-NEXT: leal (,%eax,8), %edx ; FALLBACK22-NEXT: andl $24, %edx ; FALLBACK22-NEXT: andl $60, %ecx ; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
FALLBACK22-NEXT: shrxl %edx, %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl %edx, %ebx ; FALLBACK22-NEXT: notb %bl ; FALLBACK22-NEXT: leal (%edi,%edi), %ebp ; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax ; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi ; FALLBACK22-NEXT: addl %esi, %esi ; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK22-NEXT: orl %edi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi ; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi ; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: movl 92(%esp,%ecx), 
%edi ; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi ; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %esi ; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK22-NEXT: movl 108(%esp,%ecx), %esi ; FALLBACK22-NEXT: movl %ecx, %edi ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %esi, %esi ; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK22-NEXT: orl %ecx, %esi ; FALLBACK22-NEXT: movl 120(%esp,%edi), %ebp ; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx ; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx ; FALLBACK22-NEXT: movl 116(%esp,%edi), %eax ; FALLBACK22-NEXT: shrxl %edx, %eax, %edi ; FALLBACK22-NEXT: orl %edi, %ecx ; FALLBACK22-NEXT: shrxl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: addl %eax, %eax ; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi ; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp ; FALLBACK22-NEXT: sarxl %edx, %ebp, %edx ; FALLBACK22-NEXT: addl %ebp, %ebp ; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx ; FALLBACK22-NEXT: orl %eax, %ebx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK22-NEXT: movl %edx, 60(%eax) ; FALLBACK22-NEXT: movl %ebx, 56(%eax) ; FALLBACK22-NEXT: movl %edi, 48(%eax) ; FALLBACK22-NEXT: movl %ecx, 52(%eax) ; FALLBACK22-NEXT: movl %esi, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 32(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 36(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 24(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 28(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 16(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 20(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 8(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 12(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, (%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte 
Reload ; FALLBACK22-NEXT: movl %ecx, 4(%eax) ; FALLBACK22-NEXT: addl $204, %esp ; FALLBACK22-NEXT: popl %esi ; FALLBACK22-NEXT: popl %edi ; FALLBACK22-NEXT: popl %ebx ; FALLBACK22-NEXT: popl %ebp ; FALLBACK22-NEXT: retl ; ; FALLBACK23-LABEL: ashr_64bytes: ; FALLBACK23: # %bb.0: ; FALLBACK23-NEXT: pushl %ebp ; FALLBACK23-NEXT: pushl %ebx ; FALLBACK23-NEXT: pushl %edi ; FALLBACK23-NEXT: pushl %esi ; FALLBACK23-NEXT: subl $188, %esp ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movups (%eax), %xmm0 ; FALLBACK23-NEXT: movups 16(%eax), %xmm1 ; FALLBACK23-NEXT: movups 32(%eax), %xmm2 ; FALLBACK23-NEXT: movl 48(%eax), %edx ; FALLBACK23-NEXT: movl 52(%eax), %esi ; FALLBACK23-NEXT: movl 56(%eax), %edi ; FALLBACK23-NEXT: movl 60(%eax), %eax ; FALLBACK23-NEXT: movl (%ecx), %ecx ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: sarl $31, %eax ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movl %ecx, 
%ebp ; FALLBACK23-NEXT: andl $60, %ebp ; FALLBACK23-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shll $3, %ecx ; FALLBACK23-NEXT: andl $24, %ecx ; FALLBACK23-NEXT: shrdl %cl, %edx, %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK23-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %esi ; FALLBACK23-NEXT: shrdl %cl, %edi, %esi ; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK23-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %esi, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK23-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %edi, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %esi ; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 88(%esp,%ebp), %ebx ; FALLBACK23-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %eax, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK23-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %esi, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; 
FALLBACK23-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK23-NEXT: movl 104(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl 100(%esp,%ebp), %edi ; FALLBACK23-NEXT: movl %edi, %edx ; FALLBACK23-NEXT: shrdl %cl, %eax, %edx ; FALLBACK23-NEXT: shrdl %cl, %edi, %esi ; FALLBACK23-NEXT: movl 48(%esp,%ebp), %edi ; FALLBACK23-NEXT: movl 108(%esp,%ebp), %ebp ; FALLBACK23-NEXT: movl %ebp, (%esp) # 4-byte Spill ; FALLBACK23-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK23-NEXT: movl %eax, 56(%ebp) ; FALLBACK23-NEXT: movl %esi, 48(%ebp) ; FALLBACK23-NEXT: movl %edx, 52(%ebp) ; FALLBACK23-NEXT: movl %ebx, 40(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 44(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 32(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 36(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 24(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 28(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 16(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 20(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 8(%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK23-NEXT: movl %eax, 12(%ebp) ; FALLBACK23-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK23-NEXT: shrdl %cl, %edx, %edi ; FALLBACK23-NEXT: movl %edi, (%ebp) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 
4(%ebp) ; FALLBACK23-NEXT: movl %eax, 60(%ebp) ; FALLBACK23-NEXT: addl $188, %esp ; FALLBACK23-NEXT: popl %esi ; FALLBACK23-NEXT: popl %edi ; FALLBACK23-NEXT: popl %ebx ; FALLBACK23-NEXT: popl %ebp ; FALLBACK23-NEXT: retl ; ; FALLBACK24-LABEL: ashr_64bytes: ; FALLBACK24: # %bb.0: ; FALLBACK24-NEXT: pushl %ebp ; FALLBACK24-NEXT: pushl %ebx ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi ; FALLBACK24-NEXT: subl $204, %esp ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK24-NEXT: vmovups 32(%ecx), %xmm1 ; FALLBACK24-NEXT: movl 48(%ecx), %edx ; FALLBACK24-NEXT: movl 52(%ecx), %esi ; FALLBACK24-NEXT: movl 56(%ecx), %edi ; FALLBACK24-NEXT: movl 60(%ecx), %ecx ; FALLBACK24-NEXT: movl (%eax), %eax ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: sarl $31, %ecx ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %eax, %esi ; FALLBACK24-NEXT: andl $60, %esi ; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx ; 
FALLBACK24-NEXT: shll $3, %eax ; FALLBACK24-NEXT: andl $24, %eax ; FALLBACK24-NEXT: movl %edx, %edi ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: movl 72(%esp,%esi), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK24-NEXT: movb %al, %ch ; FALLBACK24-NEXT: notb %ch ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %edi, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: addl %edx, %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: orl %edi, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx ; FALLBACK24-NEXT: movl %edx, %ebp ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi ; FALLBACK24-NEXT: leal (%edi,%edi), %ebx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %ebp, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: addl %edx, %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: orl %ebx, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx ; FALLBACK24-NEXT: movl %ebx, %ebp ; FALLBACK24-NEXT: movl %eax, %edx ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: addl %eax, %eax ; FALLBACK24-NEXT: movb %ch, %cl ; 
FALLBACK24-NEXT: shll %cl, %eax ; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: addl %ebx, %ebx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %edi, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx ; FALLBACK24-NEXT: movl %ebx, %ebp ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi ; FALLBACK24-NEXT: leal (%edi,%edi), %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax ; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: addl %ebx, %ebx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %eax, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx ; FALLBACK24-NEXT: movl %ebx, %ebp ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx ; FALLBACK24-NEXT: leal (%edx,%edx), %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax ; FALLBACK24-NEXT: orl %ebp, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: addl %ebx, %ebx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %edi, %ebx ; FALLBACK24-NEXT: movl 
%ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi ; FALLBACK24-NEXT: movl %edi, %ebp ; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %ebp, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shrl %cl, %edx ; FALLBACK24-NEXT: addl %edi, %edi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edi ; FALLBACK24-NEXT: orl %edx, %edi ; FALLBACK24-NEXT: movl %esi, %edx ; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi ; FALLBACK24-NEXT: movl %esi, %ebx ; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax ; FALLBACK24-NEXT: leal (%eax,%eax), %ebp ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebp ; FALLBACK24-NEXT: orl %ebx, %ebp ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: orl %ebx, %esi ; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %eax ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx ; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edx ; FALLBACK24-NEXT: orl %eax, %edx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: # 
kill: def $cl killed $cl killed $ecx ; FALLBACK24-NEXT: sarl %cl, %ebx ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl %ebx, 60(%eax) ; FALLBACK24-NEXT: movl %edx, 56(%eax) ; FALLBACK24-NEXT: movl %esi, 48(%eax) ; FALLBACK24-NEXT: movl %ebp, 52(%eax) ; FALLBACK24-NEXT: movl %edi, 40(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 44(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 32(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 36(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 24(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 28(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 16(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 20(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 8(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 12(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, (%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 4(%eax) ; FALLBACK24-NEXT: addl $204, %esp ; FALLBACK24-NEXT: popl %esi ; FALLBACK24-NEXT: popl %edi ; FALLBACK24-NEXT: popl %ebx ; FALLBACK24-NEXT: popl %ebp ; FALLBACK24-NEXT: vzeroupper ; FALLBACK24-NEXT: retl ; ; FALLBACK25-LABEL: ashr_64bytes: ; FALLBACK25: # %bb.0: ; FALLBACK25-NEXT: pushl %ebp ; FALLBACK25-NEXT: pushl %ebx ; FALLBACK25-NEXT: pushl %edi ; FALLBACK25-NEXT: pushl %esi ; FALLBACK25-NEXT: subl $188, %esp ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ecx ; 
FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK25-NEXT: vmovups (%eax), %ymm0 ; FALLBACK25-NEXT: vmovups 32(%eax), %xmm1 ; FALLBACK25-NEXT: movl 48(%eax), %edx ; FALLBACK25-NEXT: movl 52(%eax), %esi ; FALLBACK25-NEXT: movl 56(%eax), %edi ; FALLBACK25-NEXT: movl 60(%eax), %eax ; FALLBACK25-NEXT: movl (%ecx), %ecx ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: sarl $31, %eax ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %ecx, %ebp ; FALLBACK25-NEXT: andl $60, %ebp ; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shll $3, %ecx ; FALLBACK25-NEXT: andl $24, %ecx ; FALLBACK25-NEXT: shrdl %cl, %edx, %eax ; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %esi ; FALLBACK25-NEXT: shrdl %cl, %edi, %esi ; FALLBACK25-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %edi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi ; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl %esi, %edx ; FALLBACK25-NEXT: shrdl %cl, %eax, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edi ; FALLBACK25-NEXT: shrdl %cl, %esi, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx ; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax ; FALLBACK25-NEXT: movl %eax, %edi ; FALLBACK25-NEXT: shrdl %cl, %edx, %edi ; FALLBACK25-NEXT: shrdl %cl, %eax, %esi ; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx ; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax ; FALLBACK25-NEXT: shrdl %cl, %eax, %edx ; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK25-NEXT: movl %edx, 56(%ebp) ; FALLBACK25-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx ; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK25-NEXT: sarl %cl, %eax ; FALLBACK25-NEXT: movl %eax, 60(%ebp) ; FALLBACK25-NEXT: movl %esi, 48(%ebp) ; FALLBACK25-NEXT: movl %edi, 52(%ebp) ; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 40(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 44(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 32(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 36(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 24(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 28(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 16(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 20(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 8(%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 12(%ebp) ; FALLBACK25-NEXT: movl %ebx, (%ebp) ; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK25-NEXT: movl %eax, 4(%ebp) ; FALLBACK25-NEXT: addl $188, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi ; FALLBACK25-NEXT: popl %ebx ; FALLBACK25-NEXT: popl %ebp ; FALLBACK25-NEXT: vzeroupper ; FALLBACK25-NEXT: retl ; ; FALLBACK26-LABEL: ashr_64bytes: ; FALLBACK26: # %bb.0: ; FALLBACK26-NEXT: pushl %ebp ; FALLBACK26-NEXT: pushl %ebx ; FALLBACK26-NEXT: pushl %edi ; FALLBACK26-NEXT: pushl %esi ; FALLBACK26-NEXT: subl $204, %esp ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; 
FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK26-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK26-NEXT: vmovups 32(%ecx), %xmm1 ; FALLBACK26-NEXT: movl 48(%ecx), %edx ; FALLBACK26-NEXT: movl 52(%ecx), %esi ; FALLBACK26-NEXT: movl 56(%ecx), %edi ; FALLBACK26-NEXT: movl 60(%ecx), %ecx ; FALLBACK26-NEXT: movl (%eax), %eax ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: sarl $31, %ecx ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %eax, %ecx ; FALLBACK26-NEXT: leal (,%eax,8), %edx ; FALLBACK26-NEXT: andl $24, %edx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl %edx, %ebx ; FALLBACK26-NEXT: notb %bl ; FALLBACK26-NEXT: leal (%edi,%edi), %ebp ; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax ; FALLBACK26-NEXT: orl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, 64(%esp,%ecx), %edi ; FALLBACK26-NEXT: addl %esi, %esi ; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK26-NEXT: orl %edi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: movl 76(%esp,%ecx), %edi ; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: movl 84(%esp,%ecx), %edi ; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK26-NEXT: leal (%esi,%esi), %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: movl 92(%esp,%ecx), %edi ; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %edi, %edi ; 
FALLBACK26-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK26-NEXT: orl %eax, %edi ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: movl 100(%esp,%ecx), %edi ; FALLBACK26-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, %esi, %esi ; FALLBACK26-NEXT: addl %edi, %edi ; FALLBACK26-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK26-NEXT: orl %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal (%eax,%eax), %esi ; FALLBACK26-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK26-NEXT: movl 108(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl %ecx, %edi ; FALLBACK26-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, %esi, %ebp ; FALLBACK26-NEXT: orl %ebp, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK26-NEXT: addl %esi, %esi ; FALLBACK26-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK26-NEXT: orl %ecx, %esi ; FALLBACK26-NEXT: movl 120(%esp,%edi), %ebp ; FALLBACK26-NEXT: leal (%ebp,%ebp), %ecx ; FALLBACK26-NEXT: shlxl %ebx, %ecx, %ecx ; FALLBACK26-NEXT: movl 116(%esp,%edi), %eax ; FALLBACK26-NEXT: shrxl %edx, %eax, %edi ; FALLBACK26-NEXT: orl %edi, %ecx ; FALLBACK26-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK26-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: addl %eax, %eax ; FALLBACK26-NEXT: shlxl %ebx, %eax, %edi ; FALLBACK26-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded 
Reload ; FALLBACK26-NEXT: shrxl %edx, %ebp, %eax ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK26-NEXT: movl 124(%esp,%ebp), %ebp ; FALLBACK26-NEXT: sarxl %edx, %ebp, %edx ; FALLBACK26-NEXT: addl %ebp, %ebp ; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebx ; FALLBACK26-NEXT: orl %eax, %ebx ; FALLBACK26-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK26-NEXT: movl %edx, 60(%eax) ; FALLBACK26-NEXT: movl %ebx, 56(%eax) ; FALLBACK26-NEXT: movl %edi, 48(%eax) ; FALLBACK26-NEXT: movl %ecx, 52(%eax) ; FALLBACK26-NEXT: movl %esi, 40(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 44(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 32(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 36(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 24(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 28(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 16(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 20(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 8(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 12(%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, (%eax) ; FALLBACK26-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK26-NEXT: movl %ecx, 4(%eax) ; FALLBACK26-NEXT: addl $204, %esp ; FALLBACK26-NEXT: popl %esi ; FALLBACK26-NEXT: popl %edi ; FALLBACK26-NEXT: popl %ebx ; FALLBACK26-NEXT: popl %ebp ; FALLBACK26-NEXT: vzeroupper ; FALLBACK26-NEXT: retl ; ; FALLBACK27-LABEL: 
ashr_64bytes: ; FALLBACK27: # %bb.0: ; FALLBACK27-NEXT: pushl %ebp ; FALLBACK27-NEXT: pushl %ebx ; FALLBACK27-NEXT: pushl %edi ; FALLBACK27-NEXT: pushl %esi ; FALLBACK27-NEXT: subl $188, %esp ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: vmovups (%eax), %ymm0 ; FALLBACK27-NEXT: vmovups 32(%eax), %xmm1 ; FALLBACK27-NEXT: movl 48(%eax), %edx ; FALLBACK27-NEXT: movl 52(%eax), %esi ; FALLBACK27-NEXT: movl 56(%eax), %edi ; FALLBACK27-NEXT: movl 60(%eax), %eax ; FALLBACK27-NEXT: movl (%ecx), %ecx ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: sarl $31, %eax ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: movl %ecx, %ebp ; FALLBACK27-NEXT: andl $60, %ebp ; FALLBACK27-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shll $3, %ecx ; FALLBACK27-NEXT: andl $24, %ecx ; FALLBACK27-NEXT: shrdl %cl, %edx, %eax ; FALLBACK27-NEXT: movl 
%eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK27-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %esi ; FALLBACK27-NEXT: shrdl %cl, %edi, %esi ; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK27-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %esi, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK27-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %edi, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %esi ; FALLBACK27-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 88(%esp,%ebp), %ebx ; FALLBACK27-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %edi ; FALLBACK27-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK27-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %esi, %edx ; FALLBACK27-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK27-NEXT: movl 104(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl 100(%esp,%ebp), %edi ; FALLBACK27-NEXT: movl %edi, %edx ; FALLBACK27-NEXT: shrdl %cl, %eax, %edx ; FALLBACK27-NEXT: shrdl %cl, %edi, %esi ; FALLBACK27-NEXT: movl 48(%esp,%ebp), %edi ; FALLBACK27-NEXT: movl 108(%esp,%ebp), %ebp ; 
FALLBACK27-NEXT: movl %ebp, (%esp) # 4-byte Spill ; FALLBACK27-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK27-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK27-NEXT: movl %eax, 56(%ebp) ; FALLBACK27-NEXT: movl %esi, 48(%ebp) ; FALLBACK27-NEXT: movl %edx, 52(%ebp) ; FALLBACK27-NEXT: movl %ebx, 40(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 44(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 32(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 36(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 24(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 28(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 16(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 20(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 8(%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK27-NEXT: movl %eax, 12(%ebp) ; FALLBACK27-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK27-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK27-NEXT: shrdl %cl, %edx, %edi ; FALLBACK27-NEXT: movl %edi, (%ebp) ; FALLBACK27-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK27-NEXT: movl %ecx, 4(%ebp) ; FALLBACK27-NEXT: movl %eax, 60(%ebp) ; FALLBACK27-NEXT: addl $188, %esp ; FALLBACK27-NEXT: popl %esi ; FALLBACK27-NEXT: popl %edi ; FALLBACK27-NEXT: popl %ebx ; FALLBACK27-NEXT: popl %ebp ; FALLBACK27-NEXT: vzeroupper ; FALLBACK27-NEXT: retl ; ; FALLBACK28-LABEL: ashr_64bytes: ; FALLBACK28: # %bb.0: ; FALLBACK28-NEXT: pushl %ebp 
; FALLBACK28-NEXT: pushl %ebx ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi ; FALLBACK28-NEXT: subl $204, %esp ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK28-NEXT: vmovups 32(%ecx), %xmm1 ; FALLBACK28-NEXT: movl 48(%ecx), %edx ; FALLBACK28-NEXT: movl 52(%ecx), %esi ; FALLBACK28-NEXT: movl 56(%ecx), %edi ; FALLBACK28-NEXT: movl 60(%ecx), %ecx ; FALLBACK28-NEXT: movl (%eax), %eax ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: sarl $31, %ecx ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %eax, %esi ; FALLBACK28-NEXT: andl $60, %esi ; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx ; FALLBACK28-NEXT: shll $3, %eax ; FALLBACK28-NEXT: andl $24, %eax ; FALLBACK28-NEXT: movl %edx, %edi ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: movl 72(%esp,%esi), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: leal 
(%ecx,%ecx), %ebx ; FALLBACK28-NEXT: movb %al, %ch ; FALLBACK28-NEXT: notb %ch ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %edi, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: addl %edx, %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: orl %edi, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx ; FALLBACK28-NEXT: movl %edx, %ebp ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi ; FALLBACK28-NEXT: leal (%edi,%edi), %ebx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %ebp, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: addl %edx, %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: orl %ebx, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx ; FALLBACK28-NEXT: movl %ebx, %ebp ; FALLBACK28-NEXT: movl %eax, %edx ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: addl %eax, %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax ; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: addl %ebx, %ebx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: movb %ch, 
{{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %edi, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx ; FALLBACK28-NEXT: movl %ebx, %ebp ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi ; FALLBACK28-NEXT: leal (%edi,%edi), %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax ; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: addl %ebx, %ebx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %eax, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx ; FALLBACK28-NEXT: movl %ebx, %ebp ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx ; FALLBACK28-NEXT: leal (%edx,%edx), %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax ; FALLBACK28-NEXT: orl %ebp, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: addl %ebx, %ebx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %edi, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi ; FALLBACK28-NEXT: movl %edi, %ebp ; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill ; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx ; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %ebp, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shrl %cl, %edx ; FALLBACK28-NEXT: addl %edi, %edi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edi ; FALLBACK28-NEXT: orl %edx, %edi ; FALLBACK28-NEXT: movl %esi, %edx ; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi ; FALLBACK28-NEXT: movl %esi, %ebx ; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax ; FALLBACK28-NEXT: leal (%eax,%eax), %ebp ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebp ; FALLBACK28-NEXT: orl %ebx, %ebp ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: orl %ebx, %esi ; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %eax ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx ; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edx ; FALLBACK28-NEXT: orl %eax, %edx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK28-NEXT: sarl %cl, %ebx ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl %ebx, 60(%eax) ; FALLBACK28-NEXT: movl %edx, 56(%eax) ; FALLBACK28-NEXT: movl %esi, 48(%eax) ; FALLBACK28-NEXT: movl %ebp, 52(%eax) ; FALLBACK28-NEXT: movl 
%edi, 40(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 44(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 32(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 36(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 24(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 28(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 16(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 20(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 8(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 12(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, (%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 4(%eax) ; FALLBACK28-NEXT: addl $204, %esp ; FALLBACK28-NEXT: popl %esi ; FALLBACK28-NEXT: popl %edi ; FALLBACK28-NEXT: popl %ebx ; FALLBACK28-NEXT: popl %ebp ; FALLBACK28-NEXT: vzeroupper ; FALLBACK28-NEXT: retl ; ; FALLBACK29-LABEL: ashr_64bytes: ; FALLBACK29: # %bb.0: ; FALLBACK29-NEXT: pushl %ebp ; FALLBACK29-NEXT: pushl %ebx ; FALLBACK29-NEXT: pushl %edi ; FALLBACK29-NEXT: pushl %esi ; FALLBACK29-NEXT: subl $188, %esp ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK29-NEXT: vmovups (%eax), %ymm0 ; FALLBACK29-NEXT: vmovups 32(%eax), %xmm1 ; FALLBACK29-NEXT: movl 48(%eax), %edx ; FALLBACK29-NEXT: movl 52(%eax), %esi ; FALLBACK29-NEXT: movl 56(%eax), %edi ; FALLBACK29-NEXT: movl 60(%eax), %eax ; FALLBACK29-NEXT: movl 
(%ecx), %ecx ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: sarl $31, %eax ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %ecx, %ebp ; FALLBACK29-NEXT: andl $60, %ebp ; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shll $3, %ecx ; FALLBACK29-NEXT: andl $24, %ecx ; FALLBACK29-NEXT: shrdl %cl, %edx, %eax ; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %esi ; FALLBACK29-NEXT: shrdl %cl, %edi, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, 
%edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %edi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi ; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl %esi, %edx ; FALLBACK29-NEXT: shrdl %cl, %eax, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edi ; FALLBACK29-NEXT: shrdl %cl, %esi, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill ; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx ; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax ; FALLBACK29-NEXT: movl %eax, %edi ; FALLBACK29-NEXT: shrdl %cl, %edx, %edi ; FALLBACK29-NEXT: shrdl %cl, %eax, %esi ; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx ; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax ; FALLBACK29-NEXT: shrdl %cl, %eax, %edx ; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK29-NEXT: movl %edx, 56(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx ; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK29-NEXT: sarl %cl, %eax ; FALLBACK29-NEXT: movl %eax, 60(%ebp) ; FALLBACK29-NEXT: movl %esi, 48(%ebp) ; FALLBACK29-NEXT: movl %edi, 52(%ebp) ; 
FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 40(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 44(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 32(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 36(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 24(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 28(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 16(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 20(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 8(%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 12(%ebp) ; FALLBACK29-NEXT: movl %ebx, (%ebp) ; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK29-NEXT: movl %eax, 4(%ebp) ; FALLBACK29-NEXT: addl $188, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi ; FALLBACK29-NEXT: popl %ebx ; FALLBACK29-NEXT: popl %ebp ; FALLBACK29-NEXT: vzeroupper ; FALLBACK29-NEXT: retl ; ; FALLBACK30-LABEL: ashr_64bytes: ; FALLBACK30: # %bb.0: ; FALLBACK30-NEXT: pushl %ebp ; FALLBACK30-NEXT: pushl %ebx ; FALLBACK30-NEXT: pushl %edi ; FALLBACK30-NEXT: pushl %esi ; FALLBACK30-NEXT: subl $204, %esp ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK30-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK30-NEXT: vmovups 32(%ecx), %xmm1 ; FALLBACK30-NEXT: movl 48(%ecx), %edx ; FALLBACK30-NEXT: movl 52(%ecx), %esi ; FALLBACK30-NEXT: movl 56(%ecx), %edi ; FALLBACK30-NEXT: movl 60(%ecx), %ecx ; FALLBACK30-NEXT: 
movl (%eax), %eax ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: sarl $31, %ecx ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %eax, %ecx ; FALLBACK30-NEXT: leal (,%eax,8), %edx ; FALLBACK30-NEXT: andl $24, %edx ; FALLBACK30-NEXT: andl $60, %ecx ; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %edx, %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl %edx, %ebx ; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: leal (%edi,%edi), %ebp ; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax ; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %edx, 64(%esp,%ecx), %edi ; FALLBACK30-NEXT: addl %esi, %esi ; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK30-NEXT: orl %edi, %eax ; FALLBACK30-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 80(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: movl 76(%esp,%ecx), %edi ; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 88(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: movl 84(%esp,%ecx), %edi ; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %edx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 96(%esp,%ecx), %esi ; FALLBACK30-NEXT: leal (%esi,%esi), %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: movl 92(%esp,%ecx), %edi ; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %edi, %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK30-NEXT: orl %eax, %edi ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 104(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %edi ; FALLBACK30-NEXT: 
shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: movl 100(%esp,%ecx), %edi ; FALLBACK30-NEXT: shrxl %edx, %edi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %edx, %esi, %esi ; FALLBACK30-NEXT: addl %edi, %edi ; FALLBACK30-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK30-NEXT: orl %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl 112(%esp,%ecx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal (%eax,%eax), %esi ; FALLBACK30-NEXT: shlxl %ebx, %esi, %eax ; FALLBACK30-NEXT: movl 108(%esp,%ecx), %esi ; FALLBACK30-NEXT: movl %ecx, %edi ; FALLBACK30-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %edx, %esi, %ebp ; FALLBACK30-NEXT: orl %ebp, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; FALLBACK30-NEXT: addl %esi, %esi ; FALLBACK30-NEXT: shlxl %ebx, %esi, %esi ; FALLBACK30-NEXT: orl %ecx, %esi ; FALLBACK30-NEXT: movl 120(%esp,%edi), %ebp ; FALLBACK30-NEXT: leal (%ebp,%ebp), %ecx ; FALLBACK30-NEXT: shlxl %ebx, %ecx, %ecx ; FALLBACK30-NEXT: movl 116(%esp,%edi), %eax ; FALLBACK30-NEXT: shrxl %edx, %eax, %edi ; FALLBACK30-NEXT: orl %edi, %ecx ; FALLBACK30-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK30-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: addl %eax, %eax ; FALLBACK30-NEXT: shlxl %ebx, %eax, %edi ; FALLBACK30-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK30-NEXT: shrxl %edx, %ebp, %eax ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK30-NEXT: movl 124(%esp,%ebp), %ebp ; FALLBACK30-NEXT: sarxl %edx, %ebp, %edx ; FALLBACK30-NEXT: addl %ebp, %ebp ; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebx ; FALLBACK30-NEXT: orl %eax, %ebx ; 
FALLBACK30-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK30-NEXT: movl %edx, 60(%eax) ; FALLBACK30-NEXT: movl %ebx, 56(%eax) ; FALLBACK30-NEXT: movl %edi, 48(%eax) ; FALLBACK30-NEXT: movl %ecx, 52(%eax) ; FALLBACK30-NEXT: movl %esi, 40(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 44(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 32(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 36(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 24(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 28(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 16(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 20(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 8(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 12(%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, (%eax) ; FALLBACK30-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK30-NEXT: movl %ecx, 4(%eax) ; FALLBACK30-NEXT: addl $204, %esp ; FALLBACK30-NEXT: popl %esi ; FALLBACK30-NEXT: popl %edi ; FALLBACK30-NEXT: popl %ebx ; FALLBACK30-NEXT: popl %ebp ; FALLBACK30-NEXT: vzeroupper ; FALLBACK30-NEXT: retl ; ; FALLBACK31-LABEL: ashr_64bytes: ; FALLBACK31: # %bb.0: ; FALLBACK31-NEXT: pushl %ebp ; FALLBACK31-NEXT: pushl %ebx ; FALLBACK31-NEXT: pushl %edi ; FALLBACK31-NEXT: pushl %esi ; FALLBACK31-NEXT: subl $188, %esp ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: vmovups (%eax), %ymm0 
; FALLBACK31-NEXT: vmovups 32(%eax), %xmm1 ; FALLBACK31-NEXT: movl 48(%eax), %edx ; FALLBACK31-NEXT: movl 52(%eax), %esi ; FALLBACK31-NEXT: movl 56(%eax), %edi ; FALLBACK31-NEXT: movl 60(%eax), %eax ; FALLBACK31-NEXT: movl (%ecx), %ecx ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: sarl $31, %eax ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: movl %ecx, %ebp ; FALLBACK31-NEXT: andl $60, %ebp ; FALLBACK31-NEXT: movl 56(%esp,%ebp), %edx ; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shll $3, %ecx ; FALLBACK31-NEXT: andl $24, %ecx ; FALLBACK31-NEXT: shrdl %cl, %edx, %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi ; FALLBACK31-NEXT: movl 60(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %esi ; FALLBACK31-NEXT: shrdl %cl, %edi, %esi ; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %edx ; 
FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 72(%esp,%ebp), %esi ; FALLBACK31-NEXT: movl 68(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %esi, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 80(%esp,%ebp), %edi ; FALLBACK31-NEXT: movl 76(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %edi, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %esi ; FALLBACK31-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 88(%esp,%ebp), %ebx ; FALLBACK31-NEXT: movl 84(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %ebx, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %edi ; FALLBACK31-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 96(%esp,%ebp), %esi ; FALLBACK31-NEXT: movl 92(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %esi, %edx ; FALLBACK31-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %eax, %ebx ; FALLBACK31-NEXT: movl 104(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl 100(%esp,%ebp), %edi ; FALLBACK31-NEXT: movl %edi, %edx ; FALLBACK31-NEXT: shrdl %cl, %eax, %edx ; FALLBACK31-NEXT: shrdl %cl, %edi, %esi ; FALLBACK31-NEXT: movl 48(%esp,%ebp), %edi ; FALLBACK31-NEXT: movl 108(%esp,%ebp), %ebp ; FALLBACK31-NEXT: movl %ebp, (%esp) # 4-byte Spill ; FALLBACK31-NEXT: shrdl %cl, %ebp, %eax ; FALLBACK31-NEXT: movl {{[0-9]+}}(%esp), %ebp ; FALLBACK31-NEXT: movl %eax, 56(%ebp) ; FALLBACK31-NEXT: movl %esi, 48(%ebp) ; FALLBACK31-NEXT: movl %edx, 52(%ebp) ; FALLBACK31-NEXT: movl %ebx, 40(%ebp) ; FALLBACK31-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 44(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 32(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 36(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 24(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 28(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 16(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 20(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 8(%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK31-NEXT: movl %eax, 12(%ebp) ; FALLBACK31-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload ; FALLBACK31-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK31-NEXT: shrdl %cl, %edx, %edi ; FALLBACK31-NEXT: movl %edi, (%ebp) ; FALLBACK31-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK31-NEXT: movl %ecx, 4(%ebp) ; FALLBACK31-NEXT: movl %eax, 60(%ebp) ; FALLBACK31-NEXT: addl $188, %esp ; FALLBACK31-NEXT: popl %esi ; FALLBACK31-NEXT: popl %edi ; FALLBACK31-NEXT: popl %ebx ; FALLBACK31-NEXT: popl %ebp ; FALLBACK31-NEXT: vzeroupper ; FALLBACK31-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %byteOff = load i512, ptr %byteOff.ptr, align 1 %bitOff = shl i512 %byteOff, 3 %res = ashr i512 %src, %bitOff store i512 %res, ptr %dst, align 1 ret void } ; Test ashr_64bytes_qwordOff - arithmetic right shift of a 64-byte (i512) value by a whole number of 8-byte qwords. ; The IR at the end of the function loads the i512 source from %src.ptr and an i512 offset from %qwordOff.ptr (both align 1), converts the offset to a bit count with shl by 6 (multiply by 64), applies ashr, and stores the i512 result to %dst (align 1). ; The X64-*/X86-* comment lines that follow are the expected llc output for each FileCheck prefix; in them the source is copied to a stack buffer followed by copies of its sign word (sarq $63 / sarl $31), the offset is masked with andl $7, and the result is read back from the buffer at that index scaled by 8. ; These assertions are autogenerated by utils/update_llc_test_checks.py - regenerate them with that script instead of editing them by hand. define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nounwind { ; X64-SSE2-LABEL: ashr_64bytes_qwordOff: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pushq %rbx ; X64-SSE2-NEXT: movq (%rdi), 
%rax ; X64-SSE2-NEXT: movq 8(%rdi), %rcx ; X64-SSE2-NEXT: movq 16(%rdi), %r8 ; X64-SSE2-NEXT: movq 24(%rdi), %r9 ; X64-SSE2-NEXT: movq 32(%rdi), %r10 ; X64-SSE2-NEXT: movq 40(%rdi), %r11 ; X64-SSE2-NEXT: movq 48(%rdi), %rbx ; X64-SSE2-NEXT: movq 56(%rdi), %rdi ; X64-SSE2-NEXT: movl (%rsi), %esi ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: sarq $63, %rdi ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-SSE2-NEXT: andl $7, %esi ; X64-SSE2-NEXT: movq -128(%rsp,%rsi,8), %rax ; X64-SSE2-NEXT: movq -120(%rsp,%rsi,8), %rcx ; X64-SSE2-NEXT: movq -104(%rsp,%rsi,8), %rdi ; X64-SSE2-NEXT: movq -112(%rsp,%rsi,8), %r8 ; X64-SSE2-NEXT: movq -88(%rsp,%rsi,8), %r9 ; X64-SSE2-NEXT: movq -96(%rsp,%rsi,8), %r10 ; X64-SSE2-NEXT: movq -72(%rsp,%rsi,8), %r11 ; X64-SSE2-NEXT: movq -80(%rsp,%rsi,8), %rsi ; X64-SSE2-NEXT: movq %rsi, 48(%rdx) ; X64-SSE2-NEXT: movq %r11, 56(%rdx) ; X64-SSE2-NEXT: movq %r10, 32(%rdx) ; X64-SSE2-NEXT: movq %r9, 40(%rdx) ; X64-SSE2-NEXT: movq %r8, 16(%rdx) ; X64-SSE2-NEXT: movq %rdi, 24(%rdx) ; X64-SSE2-NEXT: movq %rax, (%rdx) ; X64-SSE2-NEXT: movq %rcx, 8(%rdx) ; X64-SSE2-NEXT: popq %rbx ; X64-SSE2-NEXT: retq ; ; X64-SSE42-LABEL: ashr_64bytes_qwordOff: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: pushq %rax ; X64-SSE42-NEXT: movups (%rdi), %xmm0 ; X64-SSE42-NEXT: movups 16(%rdi), %xmm1 ; X64-SSE42-NEXT: movups 
32(%rdi), %xmm2 ; X64-SSE42-NEXT: movq 48(%rdi), %rax ; X64-SSE42-NEXT: movq 56(%rdi), %rcx ; X64-SSE42-NEXT: movl (%rsi), %esi ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: sarq $63, %rcx ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-SSE42-NEXT: andl $7, %esi ; X64-SSE42-NEXT: movups -128(%rsp,%rsi,8), %xmm0 ; X64-SSE42-NEXT: movups -112(%rsp,%rsi,8), %xmm1 ; X64-SSE42-NEXT: movups -96(%rsp,%rsi,8), %xmm2 ; X64-SSE42-NEXT: movups -80(%rsp,%rsi,8), %xmm3 ; X64-SSE42-NEXT: movups %xmm3, 48(%rdx) ; X64-SSE42-NEXT: movups %xmm1, 16(%rdx) ; X64-SSE42-NEXT: movups %xmm2, 32(%rdx) ; X64-SSE42-NEXT: movups %xmm0, (%rdx) ; X64-SSE42-NEXT: popq %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX-LABEL: ashr_64bytes_qwordOff: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: pushq %rax ; X64-AVX-NEXT: vmovups (%rdi), %ymm0 ; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1 ; X64-AVX-NEXT: movq 48(%rdi), %rax ; X64-AVX-NEXT: movq 56(%rdi), %rcx ; X64-AVX-NEXT: movl (%rsi), %esi ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: sarq $63, %rcx ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; 
X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: andl $7, %esi ; X64-AVX-NEXT: vmovups -128(%rsp,%rsi,8), %xmm0 ; X64-AVX-NEXT: vmovups -112(%rsp,%rsi,8), %xmm1 ; X64-AVX-NEXT: vmovups -96(%rsp,%rsi,8), %xmm2 ; X64-AVX-NEXT: vmovups -80(%rsp,%rsi,8), %xmm3 ; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx) ; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx) ; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx) ; X64-AVX-NEXT: vmovups %xmm0, (%rdx) ; X64-AVX-NEXT: popq %rax ; X64-AVX-NEXT: vzeroupper ; X64-AVX-NEXT: retq ; ; X86-SSE2-LABEL: ashr_64bytes_qwordOff: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi ; X86-SSE2-NEXT: subl $188, %esp ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 4(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 8(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 12(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 16(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 20(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 24(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 28(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 32(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 36(%eax), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 40(%eax), %ebp ; X86-SSE2-NEXT: movl 44(%eax), %ebx ; X86-SSE2-NEXT: movl 48(%eax), %edi ; X86-SSE2-NEXT: movl 52(%eax), %esi ; 
X86-SSE2-NEXT: movl 56(%eax), %edx ; X86-SSE2-NEXT: movl 60(%eax), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl (%eax), %eax ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-SSE2-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: sarl $31, %ecx ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) 
; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-SSE2-NEXT: andl $7, %eax ; X86-SSE2-NEXT: movl 48(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 52(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 60(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 56(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 68(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 64(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 76(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 72(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 84(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 80(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SSE2-NEXT: movl 92(%esp,%eax,8), %ebp ; X86-SSE2-NEXT: movl 88(%esp,%eax,8), %ebx ; X86-SSE2-NEXT: movl 100(%esp,%eax,8), %edi ; X86-SSE2-NEXT: movl 96(%esp,%eax,8), %esi ; X86-SSE2-NEXT: movl 108(%esp,%eax,8), %edx ; X86-SSE2-NEXT: movl 104(%esp,%eax,8), %ecx ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE2-NEXT: movl %ecx, 56(%eax) ; X86-SSE2-NEXT: movl %edx, 60(%eax) ; X86-SSE2-NEXT: movl %esi, 48(%eax) ; X86-SSE2-NEXT: movl %edi, 52(%eax) ; X86-SSE2-NEXT: movl %ebx, 40(%eax) ; X86-SSE2-NEXT: movl %ebp, 44(%eax) 
; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 32(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 36(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 24(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 28(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 16(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 20(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 12(%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, (%eax) ; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: addl $188, %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx ; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl ; ; X86-SSE42-LABEL: ashr_64bytes_qwordOff: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: pushl %ebx ; X86-SSE42-NEXT: pushl %edi ; X86-SSE42-NEXT: pushl %esi ; X86-SSE42-NEXT: subl $128, %esp ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE42-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-SSE42-NEXT: movups (%edx), %xmm0 ; X86-SSE42-NEXT: movups 16(%edx), %xmm1 ; X86-SSE42-NEXT: movups 32(%edx), %xmm2 ; X86-SSE42-NEXT: movl 48(%edx), %esi ; X86-SSE42-NEXT: movl 52(%edx), %edi ; X86-SSE42-NEXT: movl 56(%edx), %ebx ; X86-SSE42-NEXT: movl 60(%edx), %edx ; X86-SSE42-NEXT: movl (%ecx), %ecx ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edi, {{[0-9]+}}(%esp) ; 
X86-SSE42-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movaps %xmm0, (%esp) ; X86-SSE42-NEXT: sarl $31, %edx ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-SSE42-NEXT: andl $7, %ecx ; X86-SSE42-NEXT: movups (%esp,%ecx,8), %xmm0 ; X86-SSE42-NEXT: movups 16(%esp,%ecx,8), %xmm1 ; X86-SSE42-NEXT: movups 32(%esp,%ecx,8), %xmm2 ; X86-SSE42-NEXT: movups 48(%esp,%ecx,8), %xmm3 ; X86-SSE42-NEXT: movups %xmm3, 48(%eax) ; X86-SSE42-NEXT: movups %xmm2, 32(%eax) ; X86-SSE42-NEXT: movups %xmm1, 16(%eax) ; X86-SSE42-NEXT: movups %xmm0, (%eax) ; X86-SSE42-NEXT: addl $128, %esp ; X86-SSE42-NEXT: popl %esi ; X86-SSE42-NEXT: popl %edi ; X86-SSE42-NEXT: popl %ebx ; X86-SSE42-NEXT: retl ; ; X86-AVX-LABEL: ashr_64bytes_qwordOff: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %ebx ; X86-AVX-NEXT: pushl %edi ; X86-AVX-NEXT: pushl %esi ; X86-AVX-NEXT: subl $128, %esp ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovups (%edx), %ymm0 ; X86-AVX-NEXT: vmovups 32(%edx), %xmm1 ; X86-AVX-NEXT: movl 48(%edx), %esi ; X86-AVX-NEXT: movl 52(%edx), %edi ; X86-AVX-NEXT: movl 56(%edx), %ebx ; X86-AVX-NEXT: movl 60(%edx), 
%edx ; X86-AVX-NEXT: movl (%ecx), %ecx ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: vmovups %ymm0, (%esp) ; X86-AVX-NEXT: sarl $31, %edx ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-AVX-NEXT: andl $7, %ecx ; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0 ; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1 ; X86-AVX-NEXT: vmovups 32(%esp,%ecx,8), %xmm2 ; X86-AVX-NEXT: vmovups 48(%esp,%ecx,8), %xmm3 ; X86-AVX-NEXT: vmovups %xmm3, 48(%eax) ; X86-AVX-NEXT: vmovups %xmm2, 32(%eax) ; X86-AVX-NEXT: vmovups %xmm1, 16(%eax) ; X86-AVX-NEXT: vmovups %xmm0, (%eax) ; X86-AVX-NEXT: addl $128, %esp ; X86-AVX-NEXT: popl %esi ; X86-AVX-NEXT: popl %edi ; X86-AVX-NEXT: popl %ebx ; X86-AVX-NEXT: vzeroupper ; X86-AVX-NEXT: retl %src = load i512, ptr %src.ptr, align 1 %qwordOff = load i512, ptr %qwordOff.ptr, align 1 %bitOff = shl i512 %qwordOff, 6 %res = ashr i512 %src, %bitOff store i512 %res, ptr %dst, align 1 ret void } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ALL: {{.*}} ; X64: {{.*}} ; X86: {{.*}}